update patches

parent 18fdcc94e5, commit c7d1f258aa
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=ec98e2002
+FETCH_HEAD=85c40c9b02941ebf1add1469af75f1796d513ef4

 .PHONY: help
 help:
@@ -64,7 +64,7 @@ index 8547ecc84..9f37ca70c 100644
 /* .init_tensor = */ NULL, // no initialization required
 /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index da624c587..efc63e092 100644
+index e90759f98..91421a51d 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
 @@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -84,7 +84,7 @@ index da624c587..efc63e092 100644

 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index ab0f6fe9c..6519af435 100644
+index 55fa2e6a7..58eaf45b4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -132,10 +132,10 @@ index 70bf6f3d9..f2b7fe692 100644

 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 0d37587f6..ff373d413 100644
+index 639715537..84d9f93f3 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3421,6 +3421,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
 delete ctx;
@@ -144,7 +144,7 @@ index 0d37587f6..ff373d413 100644

 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 18a45d2d9..89041805e 100644
+index e7890a5ee..d1f38235a 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
 @@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -184,10 +184,10 @@ index e996d98be..84b679315 100644

 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 34ec09d40..120191ca0 100644
+index 1459b2608..8ca9e4403 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -12447,6 +12447,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 ggml_vk_destroy_buffer(ctx->dev_buffer);
 delete ctx;
@@ -195,7 +195,7 @@ index 34ec09d40..120191ca0 100644
 }

 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -12590,6 +12591,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
 ggml_vk_host_free(vk_instance.devices[0], buffer->context);
@@ -10,7 +10,7 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 7b01a2edf..63250cdf1 100644
+index cd4092ca0..af2276960 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -31,7 +31,7 @@ index 7b01a2edf..63250cdf1 100644
 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 } else if (
 tokenizer_pre == "llama3" ||
-@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -2016,7 +2007,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
 clean_spaces = false;
 } else {
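Note: the patch title above says unknown pre-tokenizers now log instead of throwing, and the surviving context shows the `LLAMA_VOCAB_PRE_TYPE_DEFAULT` fallback. A minimal sketch of that behavior (the helper name and warning text are illustrative, not the patch's code):

```cpp
#include <cstdio>
#include <string>

enum pre_type_s { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 };

// map the gguf "tokenizer.ggml.pre" string; unknown names degrade to the
// default pre-tokenizer with a warning instead of throwing
static pre_type_s resolve_pre_type(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "llama3") {
        return PRE_TYPE_LLAMA3;
    }
    std::fprintf(stderr, "unknown pre-tokenizer type '%s', using default\n",
                 tokenizer_pre.c_str());
    return PRE_TYPE_DEFAULT;
}
```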
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
 1 file changed, 39 insertions(+)

 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 35e3aef0a..84a3796b5 100644
+index 3ba0823de..11a248963 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 35e3aef0a..84a3796b5 100644
 struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};

 //#define CLIP_DEBUG_FUNCTIONS
-@@ -1619,7 +1632,29 @@ struct clip_model_loader {
+@@ -1678,7 +1691,29 @@ struct clip_model_loader {
 {
 std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 35e3aef0a..84a3796b5 100644
 if (!fin) {
 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
 }
-@@ -1646,7 +1681,11 @@ struct clip_model_loader {
+@@ -1705,7 +1740,11 @@ struct clip_model_loader {
 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
 }
 }
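Note: this diff only shifts the wide-character patch to new line numbers; the added lines themselves are elided. The usual technique for opening UTF-8 paths on Windows filesystems with wide characters is to widen the path and use `_wfopen`. A minimal sketch under that assumption, with a hypothetical helper name (not the patch's real symbol):

```cpp
#include <cstdio>
#include <cstring>
#include <string>
#ifdef _WIN32
#include <windows.h>
#endif

static FILE * open_utf8_path(const std::string & fname, const char * mode) {
#ifdef _WIN32
    // size the UTF-16 buffer, then convert (count of -1 includes the NUL)
    int n = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, nullptr, 0);
    std::wstring wfname(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, &wfname[0], n);
    std::wstring wmode(mode, mode + std::strlen(mode)); // "rb" -> L"rb"
    return _wfopen(wfname.c_str(), wmode.c_str());
#else
    return std::fopen(fname.c_str(), mode);
#endif
}
```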
@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
 create mode 100644 src/models/solar.cpp

 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index 4192af7c0..bd44d73e7 100644
+index 1e155534b..159f429e8 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
-@@ -125,6 +125,7 @@ add_library(llama
+@@ -127,6 +127,7 @@ add_library(llama
 models/seed-oss.cpp
 models/smallthinker.cpp
 models/smollm3.cpp
@@ -31,10 +31,10 @@ index 4192af7c0..bd44d73e7 100644
 models/starcoder.cpp
 models/starcoder2.cpp
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 8caf80afc..2ce8ffec0 100644
+index 75013d8d3..22b30bfcc 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -88,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GRANITE_MOE, "granitemoe" },
 { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
 { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -42,7 +42,7 @@ index 8caf80afc..2ce8ffec0 100644
 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
 { LLM_ARCH_PLM, "plm" },
 { LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -212,6 +213,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
 { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
 { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
@@ -50,7 +50,7 @@ index 8caf80afc..2ce8ffec0 100644
 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
 { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

-@@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
+@@ -344,6 +346,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
 { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
 { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
 { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
@@ -58,7 +58,7 @@ index 8caf80afc..2ce8ffec0 100644
 { LLM_TENSOR_POS_EMBD, "position_embd" },
 { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
 { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-@@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
+@@ -2217,6 +2220,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
 return {
 LLM_TENSOR_TOKEN_EMBD,
 };
@@ -81,7 +81,7 @@ index 8caf80afc..2ce8ffec0 100644
 default:
 GGML_ABORT("unknown architecture for tensor mapping");
 }
-@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2385,6 +2404,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 // this tensor is loaded for T5, but never used
 {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index 8caf80afc..2ce8ffec0 100644
 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 6cbf9b1f8..14d461c76 100644
+index 27bdedc83..06c903bb6 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -91,6 +91,7 @@ enum llm_arch {
+@@ -92,6 +92,7 @@ enum llm_arch {
 LLM_ARCH_GRANITE_MOE,
 LLM_ARCH_GRANITE_HYBRID,
 LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 6cbf9b1f8..14d461c76 100644
 LLM_ARCH_WAVTOKENIZER_DEC,
 LLM_ARCH_PLM,
 LLM_ARCH_BAILINGMOE,
-@@ -212,6 +213,7 @@ enum llm_kv {
+@@ -216,6 +217,7 @@ enum llm_kv {
 LLM_KV_ATTENTION_OUTPUT_SCALE,
 LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
 LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 6cbf9b1f8..14d461c76 100644
 LLM_KV_ATTENTION_KEY_LENGTH_MLA,
 LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

-@@ -465,6 +467,7 @@ enum llm_tensor {
+@@ -470,6 +472,7 @@ enum llm_tensor {
 LLM_TENSOR_ENC_OUTPUT_NORM,
 LLM_TENSOR_CLS,
 LLM_TENSOR_CLS_OUT,
@@ -137,7 +137,7 @@ index fe1fa4341..aabff2f06 100644
 if (il < n_layer) {
 return swa_layers[il];
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index f6e95b5d2..c6e673276 100644
+index 42def73f0..d3c53b5f2 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
 @@ -65,6 +65,8 @@ struct llama_hparams {
@@ -149,7 +149,7 @@ index f6e95b5d2..c6e673276 100644
 uint32_t n_layer_dense_lead = 0;
 uint32_t n_lora_q = 0;
 uint32_t n_lora_kv = 0;
-@@ -259,6 +261,9 @@ struct llama_hparams {
+@@ -260,6 +262,9 @@ struct llama_hparams {

 uint32_t n_pos_per_embd() const;

@@ -160,10 +160,10 @@ index f6e95b5d2..c6e673276 100644

 bool has_kv(uint32_t il) const;
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index ca2ea2461..8916a6242 100644
+index 5003b4fbf..243b296b5 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
-@@ -466,7 +466,7 @@ namespace GGUFMeta {
+@@ -489,7 +489,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index ca2ea2461..8916a6242 100644
 llama_model_loader::llama_model_loader(
 const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index ae8207ee1..00cd579e0 100644
+index 69075742c..bdee9b6e6 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -2028,6 +2028,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
@@ -198,7 +198,7 @@ index ae8207ee1..00cd579e0 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5510,6 +5525,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -233,7 +233,7 @@ index ae8207ee1..00cd579e0 100644
 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+@@ -7664,6 +7707,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 {
 llm = std::make_unique<llm_build_chameleon>(*this, params);
 } break;
@@ -244,7 +244,7 @@ index ae8207ee1..00cd579e0 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -7932,6 +7979,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_GRANITE_MOE:
 case LLM_ARCH_GRANITE_HYBRID:
 case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index ae8207ee1..00cd579e0 100644
 case LLM_ARCH_NEO_BERT:
 case LLM_ARCH_SMOLLM3:
 diff --git a/src/llama-model.h b/src/llama-model.h
-index c6eb95318..b378b23ec 100644
+index 9c00eec75..858af51bb 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -76,6 +76,7 @@ enum llm_type {
+@@ -79,6 +79,7 @@ enum llm_type {
 LLM_TYPE_15B,
 LLM_TYPE_16B,
 LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index c6eb95318..b378b23ec 100644
 LLM_TYPE_26B,
 LLM_TYPE_27B,
 LLM_TYPE_30B,
-@@ -405,6 +406,8 @@ struct llama_layer {
+@@ -409,6 +410,8 @@ struct llama_layer {
 struct ggml_tensor * ffn_act_beta = nullptr;
 struct ggml_tensor * ffn_act_eps = nullptr;

@@ -274,10 +274,10 @@ index c6eb95318..b378b23ec 100644

 struct llama_layer_convnext convnext;
 diff --git a/src/models/models.h b/src/models/models.h
-index ffb36acc6..6d84a185d 100644
+index dd0e286ed..40f61b59d 100644
 --- a/src/models/models.h
 +++ b/src/models/models.h
-@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
+@@ -525,6 +525,11 @@ struct llm_build_smollm3 : public llm_graph_context {
 llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
 };
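Note: taken together, these hunks follow llama.cpp's standard recipe for a new architecture: an enum value in src/llama-arch.h, a name-map and tensor-name entry in src/llama-arch.cpp, hparams/tensor-loading and graph-builder cases in src/llama-model.cpp, and a new builder under src/models/ (here src/models/solar.cpp, per the diffstat). A self-contained sketch of the enum-plus-name-map half of that pattern; the `_S` suffixed symbols and the `"solar"` gguf name are illustrative assumptions, since this diff elides the inserted lines:

```cpp
#include <cstdio>
#include <map>

// illustrative stand-ins for the llm_arch enum and LLM_ARCH_NAMES map
enum llm_arch_s {
    LLM_ARCH_GRANITE_HYBRID_S,
    LLM_ARCH_SOLAR_S, // the new entry a hunk like "@@ -88,6 +88,7" inserts
    LLM_ARCH_CHAMELEON_S,
};

static const std::map<llm_arch_s, const char *> LLM_ARCH_NAMES_S = {
    { LLM_ARCH_GRANITE_HYBRID_S, "granitehybrid" },
    { LLM_ARCH_SOLAR_S,          "solar"         }, // assumed gguf name
    { LLM_ARCH_CHAMELEON_S,      "chameleon"     },
};

int main() {
    // the loader resolves the gguf architecture string to the enum, then
    // dispatches load_hparams/load_tensors/build_graph on it
    std::printf("%s\n", LLM_ARCH_NAMES_S.at(LLM_ARCH_SOLAR_S));
    return 0;
}
```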
@@ -12,7 +12,7 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 63250cdf1..dd86a1745 100644
+index af2276960..e05314272 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 4c04c3300..f4747f262 100644
+index 262d78a4c..76cb339ca 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
 @@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
 1 file changed, 4 deletions(-)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index f4747f262..d55aed348 100644
+index 76cb339ca..676fb5b5e 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
 @@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -53,7 +53,7 @@ index b165d8bdc..f91d4faba 100644
 }

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index dd86a1745..d63ce9c84 100644
+index e05314272..325ef9843 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index a59b51893..53891a91f 100644
+index f7ba1fe31..f700f74db 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -15,6 +15,8 @@
@@ -183,10 +183,10 @@ index a4c978ac1..5c0da4049 100644
 const char * grammar_root,
 bool lazy,
 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index 3f4a729bc..38a30ea05 100644
+index d96f619ae..237eb8655 100644
 --- a/src/llama-sampling.cpp
 +++ b/src/llama-sampling.cpp
-@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+@@ -1577,7 +1577,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
 }

@@ -195,7 +195,7 @@ index 3f4a729bc..38a30ea05 100644
 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

-@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
+@@ -1655,9 +1655,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
 trigger_pattern += ")[\\s\\S]*";

 std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
@@ -22,7 +22,7 @@ index a7ebe5dcd..03557bb31 100644
 size_t memory_total;
 // device type
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 6519af435..c9d3a2b03 100644
+index 58eaf45b4..693d5dd7c 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -108,7 +108,7 @@ index 6519af435..c9d3a2b03 100644
 std::string device_name(prop.name);
 if (device_name == "NVIDIA GeForce MX450") {
 turing_devices_without_mma.push_back({ id, device_name });
-@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
+@@ -4120,6 +4167,7 @@ struct ggml_backend_cuda_device_context {
 std::string name;
 std::string description;
 std::string pci_bus_id;
@@ -116,7 +116,7 @@ index 6519af435..c9d3a2b03 100644
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
+@@ -4208,6 +4256,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
 }
 #endif // defined(__linux__)

@@ -128,7 +128,7 @@ index 6519af435..c9d3a2b03 100644
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 ggml_cuda_set_device(ctx->device);
-@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4248,6 +4301,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back

 props->name = ggml_backend_cuda_device_get_name(dev);
 props->description = ggml_backend_cuda_device_get_description(dev);
@@ -136,7 +136,7 @@ index 6519af435..c9d3a2b03 100644
 props->type = ggml_backend_cuda_device_get_type(dev);
 props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
 ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4844,6 +4898,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
 dev_ctx->description = prop.name;
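Note: the `pci_bus_id` string added to the device context here is formatted elsewhere in this commit as `%04x:%02x:%02x.0`, the canonical domain:bus:device.function PCI identifier with the function fixed at 0 on the CUDA side. A runnable snippet of just that formatting step (the values are examples; the real code reads prop.pciDomainID/pciBusID/pciDeviceID):

```cpp
#include <cstdio>

int main() {
    int domain = 0x0000, bus = 0x65, device = 0x00; // example values

    char pci_bus_id[16];
    std::snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0",
                  domain, bus, device);
    std::printf("%s\n", pci_bus_id); // prints 0000:65:00.0
    return 0;
}
```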
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
 2 files changed, 13 insertions(+)

 diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
-index 2638fe4fc..c4e905a4e 100644
+index b9c4fa909..3b47aed0e 100644
 --- a/tools/mtmd/mtmd.cpp
 +++ b/tools/mtmd/mtmd.cpp
 @@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
@@ -8,7 +8,7 @@ Subject: [PATCH] no power throttling win32 with gnuc
 1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 53891a91f..8d4851312 100644
+index f700f74db..5581dd0ae 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -2479,7 +2479,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
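Note: for context on what the guarded code in `ggml_thread_apply_priority` does, the Windows mechanism involved is the per-thread power-throttling ("EcoQoS") opt-out; per the subject line, the patch only widens the compiler guard so GNU toolchains take the same path as MSVC. A sketch of the underlying Win32 call, under the assumption that this mirrors the guarded code rather than reproducing it:

```cpp
#if defined(_WIN32)
#include <windows.h>
#include <cstring>

// opt the current thread out of power throttling so the scheduler does not
// park it on efficiency cores or down-clock it (sketch, not the patch's code)
static bool thread_disable_power_throttling() {
    THREAD_POWER_THROTTLING_STATE state;
    std::memset(&state, 0, sizeof(state));
    state.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
    state.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
    state.StateMask   = 0; // 0 under this mask = do not throttle
    return SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling,
                                &state, sizeof(state)) != 0;
}
#endif
```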
@@ -178,7 +178,7 @@ index f4713a421..92ba577a5 100644

 static const struct ggml_backend_i ggml_backend_cpu_i = {
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index c9d3a2b03..25548629d 100644
+index 693d5dd7c..ed33f8f20 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -2901,7 +2901,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -241,7 +241,7 @@ index c9d3a2b03..25548629d 100644
 }

 if (!use_cuda_graph) {
-@@ -3742,7 +3752,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3752,7 +3762,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 }
 }

@@ -250,7 +250,7 @@ index c9d3a2b03..25548629d 100644
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

 ggml_cuda_set_device(cuda_ctx->device);
-@@ -3780,7 +3790,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3790,7 +3800,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 if (use_cuda_graph) {
 cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

@@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644

 static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 120191ca0..5349bce24 100644
+index 8ca9e4403..3990d67cf 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -13099,7 +13099,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
+@@ -13216,7 +13216,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
 return num_adds;
 }

@@ -290,7 +290,7 @@ index 120191ca0..5349bce24 100644
 VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

-@@ -13334,6 +13334,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
+@@ -13482,6 +13482,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 return GGML_STATUS_SUCCESS;

 UNUSED(backend);
@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
 1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
-index f68829a61..2024d3d37 100644
+index e99101184..a66d36f84 100644
 --- a/tools/mtmd/mtmd-audio.cpp
 +++ b/tools/mtmd/mtmd-audio.cpp
 @@ -1,6 +1,6 @@
@@ -226,7 +226,7 @@ index 498186a7c..7746e8b92 100644

 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index 9fcb2f9fd..e800ee8f6 100644
+index 62e618850..dac9cfcdf 100644
 --- a/ggml/src/ggml-cuda/common.cuh
 +++ b/ggml/src/ggml-cuda/common.cuh
 @@ -37,6 +37,41 @@
@@ -271,7 +271,7 @@ index 9fcb2f9fd..e800ee8f6 100644
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

-@@ -941,6 +976,9 @@ struct ggml_cuda_pool {
+@@ -976,6 +1011,9 @@ struct ggml_cuda_pool {

 virtual void * alloc(size_t size, size_t * actual_size) = 0;
 virtual void free(void * ptr, size_t size) = 0;
@@ -281,7 +281,7 @@ index 9fcb2f9fd..e800ee8f6 100644
 };

 template<typename T>
-@@ -1232,11 +1270,15 @@ struct ggml_backend_cuda_context {
+@@ -1267,11 +1305,15 @@ struct ggml_backend_cuda_context {
 // pool
 std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];

@@ -299,7 +299,7 @@ index 9fcb2f9fd..e800ee8f6 100644
 }
 return *pools[device][curr_stream_no];
 }
-@@ -1244,6 +1286,22 @@ struct ggml_backend_cuda_context {
+@@ -1279,6 +1321,22 @@ struct ggml_backend_cuda_context {
 ggml_cuda_pool & pool() {
 return pool(device);
 }
@@ -323,7 +323,7 @@ index 9fcb2f9fd..e800ee8f6 100644

 struct ggml_cuda_mm_fusion_args_host {
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 25548629d..eeaae3fe4 100644
+index ed33f8f20..3a7fd31e0 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -365,6 +365,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -567,7 +567,7 @@ index 25548629d..eeaae3fe4 100644
 };

 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-@@ -3274,6 +3338,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
+@@ -3284,6 +3348,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,

 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
 bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -575,7 +575,7 @@ index 25548629d..eeaae3fe4 100644
 // flag used to determine whether it is an integrated_gpu
 const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

-@@ -3410,6 +3475,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3420,6 +3485,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 continue;
 }

@@ -586,7 +586,7 @@ index 25548629d..eeaae3fe4 100644

 // start of fusion operations
 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
-@@ -3754,6 +3823,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3764,6 +3833,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -594,7 +594,7 @@ index 25548629d..eeaae3fe4 100644

 ggml_cuda_set_device(cuda_ctx->device);

-@@ -3829,6 +3899,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3839,6 +3909,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 return GGML_STATUS_SUCCESS;
 }

@@ -672,7 +672,7 @@ index 25548629d..eeaae3fe4 100644
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-@@ -4097,6 +4238,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
+@@ -4107,6 +4248,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
 /* .event_record = */ ggml_backend_cuda_event_record,
 /* .event_wait = */ ggml_backend_cuda_event_wait,
 /* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
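Note: the common.cuh hunks above widen the memory-pool table to one pool per device per stream, created lazily on first use (the `return *pools[device][curr_stream_no];` context line is the tail of that lookup). A self-contained sketch of the pattern with a stand-in pool type, since `ggml_cuda_pool`'s full definition is elided in this diff:

```cpp
#include <cstddef>
#include <cstdlib>
#include <memory>

// stand-in for ggml_cuda_pool (alloc/free are the two virtuals the hunk shows)
struct pool_iface {
    virtual ~pool_iface() = default;
    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void   free(void * ptr, size_t size) = 0;
};

// trivial host-memory pool standing in for the real CUDA pool
struct malloc_pool : pool_iface {
    void * alloc(size_t size, size_t * actual_size) override {
        *actual_size = size;
        return std::malloc(size);
    }
    void free(void * ptr, size_t /*size*/) override { std::free(ptr); }
};

struct cuda_ctx_sketch {
    static constexpr int MAX_DEVICES = 16; // GGML_CUDA_MAX_DEVICES stand-in
    static constexpr int MAX_STREAMS = 8;  // GGML_CUDA_MAX_STREAMS stand-in
    std::unique_ptr<pool_iface> pools[MAX_DEVICES][MAX_STREAMS];
    int device         = 0;
    int curr_stream_no = 0;

    pool_iface & pool(int dev) {
        if (pools[dev][curr_stream_no] == nullptr) {
            // the real code builds a device-specific pool here
            pools[dev][curr_stream_no] = std::make_unique<malloc_pool>();
        }
        return *pools[dev][curr_stream_no];
    }
    pool_iface & pool() { return pool(device); } // mirrors the context lines
};
```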
@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
 1 file changed, 1 insertion(+), 2 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 8786d4ee3..9e6998272 100644
+index 015ebae71..e346e7231 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
+@@ -1050,8 +1050,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 const int64_t n_vocab = vocab.n_tokens();
 const int64_t n_embd = hparams.n_embd_inp();
@@ -62,7 +62,7 @@ index 7746e8b92..189e97170 100644
 GGML_ASSERT(device);
 return device->iface.get_buffer_type(device);
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index eeaae3fe4..6852d2e20 100644
+index 3a7fd31e0..cfe21ebc7 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -113,6 +113,11 @@ int ggml_cuda_get_device() {
@@ -77,7 +77,7 @@ index eeaae3fe4..6852d2e20 100644
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
 ggml_cuda_set_device(device);
 cudaError_t err;
-@@ -4448,7 +4453,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4458,7 +4463,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 props->id = ggml_backend_cuda_device_get_id(dev);
 props->type = ggml_backend_cuda_device_get_type(dev);
 props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
@@ -89,7 +89,7 @@ index eeaae3fe4..6852d2e20 100644

 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
-@@ -4908,6 +4916,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+@@ -4918,6 +4926,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
 CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

@@ -101,7 +101,7 @@ index eeaae3fe4..6852d2e20 100644
 static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
 /* .get_name = */ ggml_backend_cuda_device_get_name,
 /* .get_description = */ ggml_backend_cuda_device_get_description,
-@@ -4924,6 +4937,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+@@ -4934,6 +4947,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
 /* .event_new = */ ggml_backend_cuda_device_event_new,
 /* .event_free = */ ggml_backend_cuda_device_event_free,
 /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -122,10 +122,10 @@ index eeaae3fe4..6852d2e20 100644
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 diff --git a/src/llama.cpp b/src/llama.cpp
-index f69964b6d..759152b76 100644
+index 1e18637e3..ad0f45812 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -921,10 +921,12 @@ static struct llama_model * llama_model_load_from_file_impl(
+@@ -934,10 +934,12 @@ static struct llama_model * llama_model_load_from_file_impl(
 for (auto * dev : model->devices) {
 ggml_backend_dev_props props;
 ggml_backend_dev_get_props(dev, &props);
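Note: the llama.cpp hunk above queries each device's props at load time, and one CUDA hunk later in this commit notes that memory figures are intentionally left zero in props and should be fetched with ggml_backend_dev_memory() instead. A sketch of that consumer pattern using the public ggml-backend API (only functions and fields visible in this diff or in stock ggml-backend.h are used):

```cpp
#include <cstdio>
#include "ggml-backend.h"

static void dump_devices() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        // memory is not filled in props here; ask for it explicitly
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);

        std::printf("%s (%s): %zu / %zu bytes free\n",
                    props.name, props.description, free_mem, total_mem);
    }
}
```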
@@ -3,16 +3,6 @@ From: Daniel Hiltgen <daniel@ollama.com>
 Date: Tue, 26 Aug 2025 12:48:29 -0700
 Subject: [PATCH] GPU discovery enhancements

 Expose more information about the devices through backend props, and leverage
 management libraries for more accurate VRAM usage reporting if available.
-
-vulkan: get GPU ID (ollama v0.11.5)
-
-Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
-
-Vulkan PCI and Memory
-
-fix vulkan PCI ID and ID handling
 ---
 ggml/include/ggml-backend.h | 6 +
 ggml/src/CMakeLists.txt | 2 +
@@ -45,7 +35,7 @@ index 92ca32a4b..6ad583f09 100644

 GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index d55aed348..99ae293cc 100644
+index 676fb5b5e..6283c2d30 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
 @@ -205,6 +205,8 @@ add_library(ggml-base
@@ -58,7 +48,7 @@ index d55aed348..99ae293cc 100644

 set_target_properties(ggml-base PROPERTIES
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 6852d2e20..334a30135 100644
+index cfe21ebc7..53ce7827c 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -267,6 +267,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -90,7 +80,7 @@ index 6852d2e20..334a30135 100644
 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
 ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -4317,6 +4332,11 @@ struct ggml_backend_cuda_device_context {
+@@ -4327,6 +4342,11 @@ struct ggml_backend_cuda_device_context {
 std::string description;
 std::string pci_bus_id;
 std::string id;
@@ -102,7 +92,7 @@ index 6852d2e20..334a30135 100644
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -4413,6 +4433,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -4423,6 +4443,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 ggml_cuda_set_device(ctx->device);
@@ -131,7 +121,7 @@ index 6852d2e20..334a30135 100644
 CUDA_CHECK(cudaMemGetInfo(free, total));

 // ref: https://github.com/ggml-org/llama.cpp/pull/17368
-@@ -4445,6 +4487,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -4455,6 +4497,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 return GGML_BACKEND_DEVICE_TYPE_GPU;
 }

@@ -139,7 +129,7 @@ index 6852d2e20..334a30135 100644
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;

-@@ -4458,6 +4501,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4468,6 +4511,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 // If you need the memory data, call ggml_backend_dev_memory() explicitly.
 props->memory_total = props->memory_free = 0;

@@ -159,7 +149,7 @@ index 6852d2e20..334a30135 100644
 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
 bool events = false;
-@@ -5047,6 +5103,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -5057,6 +5113,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 std::lock_guard<std::mutex> lock(mutex);
 if (!initialized) {
 ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -167,7 +157,7 @@ index 6852d2e20..334a30135 100644

 for (int i = 0; i < ggml_cuda_info().device_count; i++) {
 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -5062,6 +5119,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -5072,6 +5129,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
 dev_ctx->pci_bus_id = pci_bus_id;

@@ -243,7 +233,7 @@ index ba95b4acc..f6f8f7a10 100644
 /* .async = */ true,
 /* .host_buffer = */ false,
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 5349bce24..0103fd03a 100644
+index 3990d67cf..f3e65990d 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -236,6 +236,7 @@ class vk_memory_logger;
@@ -254,7 +244,7 @@ index 5349bce24..0103fd03a 100644

 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
-@@ -12350,6 +12351,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
+@@ -12432,6 +12433,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
 snprintf(description, description_size, "%s", props.deviceName.data());
 }

@@ -284,7 +274,7 @@ index 5349bce24..0103fd03a 100644
 // backend interface

 #define UNUSED GGML_UNUSED
-@@ -13628,15 +13652,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
+@@ -13828,15 +13852,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
 ggml_vk_get_device_description(dev_idx, description, description_size);
 }

@@ -361,7 +351,7 @@ index 5349bce24..0103fd03a 100644

 if (membudget_supported) {
 memprops.pNext = &budgetprops;
-@@ -13688,8 +13769,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+@@ -13888,8 +13969,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
 }
 }

@@ -376,7 +366,7 @@ index 5349bce24..0103fd03a 100644
 }

 vk::PhysicalDeviceProperties2 props = {};
-@@ -13706,19 +13792,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+@@ -13906,19 +13992,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {

 char pci_bus_id[16] = {};
 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
@@ -410,7 +400,7 @@ index 5349bce24..0103fd03a 100644

 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
 ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-@@ -13730,9 +13821,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
+@@ -13930,9 +14021,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
 return ctx->description.c_str();
 }

@@ -426,7 +416,7 @@ index 5349bce24..0103fd03a 100644
 }

 static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
-@@ -13756,8 +13852,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+@@ -13956,8 +14052,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml

 props->name = ggml_backend_vk_device_get_name(dev);
 props->description = ggml_backend_vk_device_get_description(dev);
@@ -436,10 +426,10 @@ index 5349bce24..0103fd03a 100644
 + props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
 ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
 props->caps = {
-/* .async = */ false,
-@@ -13765,6 +13862,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+/* .async = */ true,
+@@ -13965,6 +14062,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
 /* .buffer_from_host_ptr = */ false,
-/* .events = */ false,
+/* .events = */ true,
 };
 +
 + props->compute_major = ctx->major;
@@ -451,7 +441,7 @@ index 5349bce24..0103fd03a 100644
 }

 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
-@@ -14331,6 +14435,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -14573,6 +14677,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
 static std::mutex mutex;
 std::lock_guard<std::mutex> lock(mutex);
 if (!initialized) {
@@ -460,7 +450,7 @@ index 5349bce24..0103fd03a 100644
 for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
 char desc[256];
-@@ -14339,12 +14445,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -14581,12 +14687,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
 ctx->name = GGML_VK_NAME + std::to_string(i);
 ctx->description = desc;
 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;