From f5d663e370a7bdb0acda66a448f21cc8f96f9b83 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 7 Jun 2025 12:36:59 -0400 Subject: [PATCH] update patches --- ...loc-and-free-using-the-same-compiler.patch | 38 +++++++++--------- llama/patches/0002-pretokenizer.patch | 2 +- llama/patches/0003-embeddings.patch | 8 ++-- llama/patches/0004-clip-unicode.patch | 8 ++-- llama/patches/0005-solar-pro.patch | 40 +++++++++---------- .../0006-fix-deepseek-deseret-regex.patch | 2 +- ...ntain-ordering-for-rules-for-grammar.patch | 22 ++++++++++ ...patch => 0008-sort-devices-by-score.patch} | 0 ...arget-ggml-cpu-for-all-cpu-variants.patch} | 12 +++--- llama/patches/0010-remove-amx.patch | 25 ++++++++++++ ...h => 0011-fix-string-arr-kv-loading.patch} | 8 ++-- llama/patches/0011-remove-amx.patch | 25 ------------ ...r.patch => 0012-ollama-debug-tensor.patch} | 4 +- ...dd-ollama-vocab-for-grammar-support.patch} | 8 ++-- ...4-add-argsort-and-cuda-copy-for-i32.patch} | 16 ++++---- ...5-graph-memory-reporting-on-failure.patch} | 4 +- ...patch => 0016-ggml-Export-GPU-UUIDs.patch} | 14 +++---- 17 files changed, 129 insertions(+), 107 deletions(-) create mode 100644 llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch rename llama/patches/{0009-sort-devices-by-score.patch => 0008-sort-devices-by-score.patch} (100%) rename llama/patches/{0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch => 0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch} (71%) create mode 100644 llama/patches/0010-remove-amx.patch rename llama/patches/{0012-fix-string-arr-kv-loading.patch => 0011-fix-string-arr-kv-loading.patch} (94%) delete mode 100644 llama/patches/0011-remove-amx.patch rename llama/patches/{0013-ollama-debug-tensor.patch => 0012-ollama-debug-tensor.patch} (91%) rename llama/patches/{0014-add-ollama-vocab-for-grammar-support.patch => 0013-add-ollama-vocab-for-grammar-support.patch} (97%) rename llama/patches/{0015-add-argsort-and-cuda-copy-for-i32.patch => 
0014-add-argsort-and-cuda-copy-for-i32.patch} (96%) rename llama/patches/{0016-graph-memory-reporting-on-failure.patch => 0015-graph-memory-reporting-on-failure.patch} (98%) rename llama/patches/{0017-ggml-Export-GPU-UUIDs.patch => 0016-ggml-Export-GPU-UUIDs.patch} (92%) diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch index edeeb4ffa..4f569c8f3 100644 --- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -24,7 +24,7 @@ problem. 9 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index b30b4cb3..0ce73a99 100644 +index b1050ad5..e8694e5c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -43,7 +43,7 @@ index b30b4cb3..0ce73a99 100644 } static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { -@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -1879,6 +1879,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_aligned_free(buffer->context, buffer->size); @@ -55,7 +55,7 @@ index b30b4cb3..0ce73a99 100644 } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { -@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { +@@ -1926,7 +1931,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { @@ -65,10 +65,10 @@ index 
b30b4cb3..0ce73a99 100644 /* .init_tensor = */ NULL, // no initialization required /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp -index e2617b06..242e50a7 100644 +index c0ea2600..6c3398da 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp -@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer( +@@ -801,6 +801,7 @@ static void ggml_backend_cann_buffer_free_buffer( ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; delete ctx; @@ -76,7 +76,7 @@ index e2617b06..242e50a7 100644 } /** -@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf +@@ -1473,6 +1474,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf */ static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { ACL_CHECK(aclrtFreeHost(buffer->context)); @@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 /** diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index b4b85abc..cb0d8528 100644 +index 2a6f7f10..ec031650 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -104,7 +104,7 @@ index b4b85abc..cb0d8528 100644 } static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ +@@ -1071,6 +1073,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); @@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-metal/ggml-metal.m 
b/ggml/src/ggml-metal/ggml-metal.m -index 576f9581..1b56f858 100644 +index bc93bc63..fd3a9d1b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) +@@ -5272,6 +5272,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); @@ -137,10 +137,10 @@ index 576f9581..1b56f858 100644 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp -index 05a2f4e6..392cc18d 100644 +index 80a36438..6abb0ab2 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp -@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context { +@@ -2366,6 +2366,7 @@ struct ggml_backend_opencl_buffer_context { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; delete ctx; @@ -161,10 +161,10 @@ index 4f0abb5a..de1ec184 100644 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp -index 0ea72994..ae3a3c33 100644 +index 78513114..0dabdfe7 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp -@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { +@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { ggml_sycl_set_device(ctx->device); delete ctx; @@ -172,7 +172,7 @@ index 0ea72994..ae3a3c33 100644 } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ -@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context { +@@ -791,6 +792,7 @@ struct ggml_backend_sycl_split_buffer_context { static 
void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -180,7 +180,7 @@ index 0ea72994..ae3a3c33 100644 } static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ +@@ -1133,6 +1135,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); @@ -189,10 +189,10 @@ index 0ea72994..ae3a3c33 100644 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index e2b357fd..68768029 100644 +index 3e43b03b..01776f3d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -9272,6 +9272,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -200,7 +200,7 @@ index e2b357fd..68768029 100644 } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe +@@ -9415,6 +9416,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); diff --git 
a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index 07aa4b0ea..3caf5287a 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -10,7 +10,7 @@ logs instead of throwing an error 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 9389ca80..806c1b3d 100644 +index ba2e1864..0d7ad157 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index 80d6b55e5..246cd2919 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -11,10 +11,10 @@ instead of forcing one or the error 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 62246c10..dca22d8b 100644 +index c29fe7e4..148d1132 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -952,7 +952,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_all = 0; // count outputs @@ -23,7 +23,7 @@ index 62246c10..dca22d8b 100644 for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } -@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1083,7 +1083,7 @@ int llama_context::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} @@ -32,7 +32,7 @@ index 62246c10..dca22d8b 100644 auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr; if (t_embd && res->get_embd_pooled()) { -@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1244,7 +1244,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 957109783..5a3b1e43d 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -10,10 +10,10 @@ filesystems for paths that include wide characters 1 file changed, 39 insertions(+) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp -index 41ba45a7..cdd8ca44 100644 +index c25bacc1..b3f92814 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp -@@ -31,6 +31,19 @@ +@@ -28,6 +28,19 @@ #include #include @@ -33,7 +33,7 @@ index 41ba45a7..cdd8ca44 100644 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; enum ffn_op_type { -@@ -2190,7 +2203,29 @@ struct clip_model_loader { +@@ -2552,7 +2565,29 @@ struct clip_model_loader { { std::vector read_buf; @@ -63,7 +63,7 @@ index 41ba45a7..cdd8ca44 100644 if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } -@@ -2217,7 +2252,11 @@ struct clip_model_loader { +@@ -2579,7 +2614,11 @@ struct clip_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index deb53c225..be6ec4c29 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -15,7 +15,7 @@ adds support for the Solar Pro architecture 7 files changed, 248 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index f2bc8ca7..5ab3f572 100644 +index c0590e10..6d9f0719 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -69,6 +69,7 @@ static const std::map 
LLM_ARCH_NAMES = { @@ -34,7 +34,7 @@ index f2bc8ca7..5ab3f572 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -1502,6 +1504,24 @@ static const std::map> LLM_TENSOR_N +@@ -1508,6 +1510,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, @@ -59,7 +59,7 @@ index f2bc8ca7..5ab3f572 100644 { LLM_ARCH_WAVTOKENIZER_DEC, { -@@ -1680,6 +1700,7 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1686,6 +1706,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, @@ -68,7 +68,7 @@ index f2bc8ca7..5ab3f572 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h -index 41a023da..525c1b7d 100644 +index 930cb4ec..591bc14e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -73,6 +73,7 @@ enum llm_arch { @@ -87,7 +87,7 @@ index 41a023da..525c1b7d 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -346,6 +348,7 @@ enum llm_tensor { +@@ -348,6 +350,7 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, @@ -96,10 +96,10 @@ index 41a023da..525c1b7d 100644 LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_NORM, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index 90dfe7a7..8a667960 100644 +index 1499eb08..aa7a4b23 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp -@@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const { +@@ -86,6 +86,14 @@ uint32_t llama_hparams::n_embd_v_s() const { return ssm_d_state * ssm_d_inner; } @@ -113,12 +113,12 @@ index 90dfe7a7..8a667960 100644 + bool llama_hparams::is_swa(uint32_t il) const { if 
(il < n_layer) { - return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); + return swa_layers[il]; diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 7ee6a5b7..48dce407 100644 +index b2bcb8b0..347d239d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h -@@ -55,6 +55,8 @@ struct llama_hparams { +@@ -59,6 +59,8 @@ struct llama_hparams { std::array n_head_kv_arr; std::array n_ff_arr; @@ -127,7 +127,7 @@ index 7ee6a5b7..48dce407 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; -@@ -154,6 +156,9 @@ struct llama_hparams { +@@ -186,6 +188,9 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_v_s() const; @@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644 }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index 4cce5166..7f6617fa 100644 +index ddb1b036..f4a6c2cd 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -150,10 +150,10 @@ index 4cce5166..7f6617fa 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 3a4e72a3..831b68c0 100644 +index afef8487..c042546c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1417,6 +1417,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; @@ -175,7 +175,7 @@ index 3a4e72a3..831b68c0 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -3797,6 +3812,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -210,7 +210,7 @@ index 3a4e72a3..831b68c0 100644 layer.ffn_gate = 
create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context { +@@ -12721,6 +12764,165 @@ struct llm_build_chameleon : public llm_graph_context { } }; @@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644 + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -376,7 +376,7 @@ index 3a4e72a3..831b68c0 100644 struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; -@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13515,6 +13717,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -387,7 +387,7 @@ index 3a4e72a3..831b68c0 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { llm = std::make_unique(*this, params, gf); -@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13663,6 +13869,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: @@ -396,7 +396,7 @@ index 3a4e72a3..831b68c0 100644 return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h -index 6bdec263..43746c7d 100644 +index cbea2cb3..43e7fcda 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -65,6 +65,7 @@ enum llm_type { diff --git 
a/llama/patches/0006-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch index ff4b57577..998d5e76f 100644 --- a/llama/patches/0006-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0006-fix-deepseek-deseret-regex.patch @@ -12,7 +12,7 @@ regex 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 806c1b3d..10f34d33 100644 +index 0d7ad157..d007039f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { diff --git a/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch new file mode 100644 index 000000000..182760fce --- /dev/null +++ b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch @@ -0,0 +1,22 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Tue, 8 Apr 2025 19:43:40 -0700 +Subject: [PATCH] maintain ordering for rules for grammar + +--- + common/json-schema-to-grammar.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp +index d38a74f9..2a8aeca6 100644 +--- a/common/json-schema-to-grammar.cpp ++++ b/common/json-schema-to-grammar.cpp +@@ -350,7 +350,7 @@ private: + friend std::string build_grammar(const std::function & cb, const common_grammar_options & options); + std::function _fetch_json; + bool _dotall; +- std::map _rules; ++ std::unordered_map _rules; + std::unordered_map _refs; + std::unordered_set _refs_being_resolved; + std::vector _errors; diff --git a/llama/patches/0009-sort-devices-by-score.patch b/llama/patches/0008-sort-devices-by-score.patch similarity index 100% rename from llama/patches/0009-sort-devices-by-score.patch rename to llama/patches/0008-sort-devices-by-score.patch diff --git a/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch 
b/llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch similarity index 71% rename from llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch rename to llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch index 21c1fc42f..32fcc7ceb 100644 --- a/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch +++ b/llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch @@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants 1 file changed, 2 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index ddea5ad3..45918bf6 100644 +index 7dcb031f..770e18bc 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) +@@ -282,6 +282,7 @@ function(ggml_add_cpu_backend_variant tag_name) endforeach() ggml_add_cpu_backend_variant_impl(${tag_name}) @@ -19,11 +19,11 @@ index ddea5ad3..45918bf6 100644 endfunction() ggml_add_backend(CPU) -@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS) +@@ -290,6 +291,7 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() + add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(x64) - ggml_add_cpu_backend_variant(sse42 SSE42) - ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + if (GGML_SYSTEM_ARCH STREQUAL "x86") + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) diff --git a/llama/patches/0010-remove-amx.patch b/llama/patches/0010-remove-amx.patch new file mode 100644 index 000000000..1dcf58492 --- /dev/null +++ b/llama/patches/0010-remove-amx.patch @@ -0,0 +1,25 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Thu, 1 May 2025 15:05:08 -0700 +Subject: [PATCH] remove amx + +disable amx as it reduces performance on some systems +--- + ggml/src/CMakeLists.txt | 4 ---- + 1 file 
changed, 4 deletions(-) + +diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt +index 770e18bc..62f3dbf6 100644 +--- a/ggml/src/CMakeLists.txt ++++ b/ggml/src/CMakeLists.txt +@@ -300,10 +300,6 @@ if (GGML_CPU_ALL_VARIANTS) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) +- if (NOT MSVC) +- # MSVC doesn't support AMX +- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) +- endif() + else() + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}") + endif() diff --git a/llama/patches/0012-fix-string-arr-kv-loading.patch b/llama/patches/0011-fix-string-arr-kv-loading.patch similarity index 94% rename from llama/patches/0012-fix-string-arr-kv-loading.patch rename to llama/patches/0011-fix-string-arr-kv-loading.patch index f879c50ee..20c348048 100644 --- a/llama/patches/0012-fix-string-arr-kv-loading.patch +++ b/llama/patches/0011-fix-string-arr-kv-loading.patch @@ -25,10 +25,10 @@ index 79ee2020..3efb22f0 100644 // get ith C string from array with given key_id GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp -index 381a9c7d..e45b453d 100644 +index a0a318a2..b3326b94 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp -@@ -777,10 +777,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id +@@ -794,10 +794,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); @@ -44,7 +44,7 @@ index 381a9c7d..e45b453d 100644 const char * 
gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING); -@@ -874,7 +878,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) { +@@ -891,7 +895,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) { const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].get_ne() == 1); @@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 10f34d33..9f5fd57b 100644 +index d007039f..4a6c3ad6 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { diff --git a/llama/patches/0011-remove-amx.patch b/llama/patches/0011-remove-amx.patch deleted file mode 100644 index 296a37612..000000000 --- a/llama/patches/0011-remove-amx.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Thu, 1 May 2025 15:05:08 -0700 -Subject: [PATCH] remove amx - -disable amx as it reduces performance on some systems ---- - ggml/src/CMakeLists.txt | 4 ---- - 1 file changed, 4 deletions(-) - -diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 45918bf6..0beaed86 100644 ---- a/ggml/src/CMakeLists.txt -+++ b/ggml/src/CMakeLists.txt -@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) - ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) -- if (NOT MSVC) -- # MSVC doesn't support AMX -- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 
AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) -- endif() - elseif (GGML_CPU) - ggml_add_cpu_backend_variant_impl("") - endif() diff --git a/llama/patches/0013-ollama-debug-tensor.patch b/llama/patches/0012-ollama-debug-tensor.patch similarity index 91% rename from llama/patches/0013-ollama-debug-tensor.patch rename to llama/patches/0012-ollama-debug-tensor.patch index 53d911277..098f4d5aa 100644 --- a/llama/patches/0013-ollama-debug-tensor.patch +++ b/llama/patches/0012-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index a30e67f2..2462d2b8 100644 +index c7426df2..23441678 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2873,6 +2875,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0013-add-ollama-vocab-for-grammar-support.patch similarity index 97% rename from llama/patches/0014-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0013-add-ollama-vocab-for-grammar-support.patch index ee81800e2..2d3731236 100644 --- a/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch +++ b/llama/patches/0013-add-ollama-vocab-for-grammar-support.patch @@ -10,7 +10,7 @@ Subject: [PATCH] add ollama vocab for grammar support 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp -index 973b47ae..60d58236 100644 +index bed706bb..b51cee09 100644 --- a/src/llama-grammar.cpp +++ 
b/src/llama-grammar.cpp @@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack( @@ -90,7 +90,7 @@ index 973b47ae..60d58236 100644 if (grammar.awaiting_trigger) { if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) { -@@ -1191,13 +1200,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token +@@ -1201,13 +1210,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token } } @@ -107,7 +107,7 @@ index 973b47ae..60d58236 100644 } llama_grammar_accept_str(grammar, piece); -@@ -1217,3 +1227,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string +@@ -1227,3 +1237,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece); } } @@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644 const char * grammar_root, bool lazy, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp -index 804b11e0..15a10ca8 100644 +index bfbf5fa2..11f93f42 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { diff --git a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch similarity index 96% rename from llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch rename to llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch index b71295c76..7107cd049 100644 --- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch +++ b/llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch @@ -10,10 +10,10 @@ Subject: [PATCH] add argsort and cuda copy for i32 3 files changed, 192 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index becdae07..7a44b6cf 100644 +index 
08facb6d..aa5cf56b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32( +@@ -6925,6 +6925,45 @@ static void ggml_compute_forward_argsort_f32( } } @@ -59,7 +59,7 @@ index becdae07..7a44b6cf 100644 void ggml_compute_forward_argsort( const ggml_compute_params * params, ggml_tensor * dst) { -@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort( +@@ -6936,6 +6975,10 @@ void ggml_compute_forward_argsort( { ggml_compute_forward_argsort_f32(params, dst); } break; @@ -195,10 +195,10 @@ index 607ded85..53b02634 100644 + } } diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu -index 2d46176e..47383486 100644 +index 2c55d214..90d95d32 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu -@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { +@@ -41,6 +41,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { *dsti = *xi; } @@ -212,7 +212,7 @@ index 2d46176e..47383486 100644 template static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, -@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in +@@ -71,6 +78,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in cpy_1(cx + x_offset, cdst + dst_offset); } @@ -257,7 +257,7 @@ index 2d46176e..47383486 100644 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q8_0 * dsti = (block_q8_0 *) cdsti; -@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg +@@ -643,6 +688,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, 
ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); @@ -266,7 +266,7 @@ index 2d46176e..47383486 100644 } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); -@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { +@@ -698,6 +745,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { return (void*) cpy_f32_f16; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { return (void*) cpy_f32_f16; diff --git a/llama/patches/0016-graph-memory-reporting-on-failure.patch b/llama/patches/0015-graph-memory-reporting-on-failure.patch similarity index 98% rename from llama/patches/0016-graph-memory-reporting-on-failure.patch rename to llama/patches/0015-graph-memory-reporting-on-failure.patch index 921882249..115c3ab21 100644 --- a/llama/patches/0016-graph-memory-reporting-on-failure.patch +++ b/llama/patches/0015-graph-memory-reporting-on-failure.patch @@ -134,10 +134,10 @@ index 5fd379f6..04812990 100644 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index 0ce73a99..be335e8c 100644 +index e8694e5c..36f11537 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp -@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe +@@ -1637,6 +1637,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch 
b/llama/patches/0016-ggml-Export-GPU-UUIDs.patch similarity index 92% rename from llama/patches/0017-ggml-Export-GPU-UUIDs.patch rename to llama/patches/0016-ggml-Export-GPU-UUIDs.patch index a2539034c..b56785a30 100644 --- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch +++ b/llama/patches/0016-ggml-Export-GPU-UUIDs.patch @@ -24,10 +24,10 @@ index 74e46716..a880df33 100644 size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..4c829153 100644 +index ec031650..8d5edd04 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { +@@ -2893,6 +2893,7 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; @@ -35,7 +35,7 @@ index cb0d8528..4c829153 100644 }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t +@@ -2905,6 +2906,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } @@ -47,7 +47,7 @@ index cb0d8528..4c829153 100644 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); -@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -2919,6 +2925,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); @@ -55,7 +55,7 @@ index cb0d8528..4c829153 100644 props->type = 
ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); -@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -3473,6 +3480,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; @@ -89,10 +89,10 @@ index cb0d8528..4c829153 100644 /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ &reg, diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..ee4f2dcb 100644 +index fd3a9d1b..884bde80 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen +@@ -5761,6 +5761,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { props->name = ggml_backend_metal_device_get_name(dev); props->description = ggml_backend_metal_device_get_description(dev);