From f5d663e370a7bdb0acda66a448f21cc8f96f9b83 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 7 Jun 2025 12:36:59 -0400
Subject: [PATCH] update patches

---
 ...loc-and-free-using-the-same-compiler.patch | 38 +++++++++---------
 llama/patches/0002-pretokenizer.patch         |  2 +-
 llama/patches/0003-embeddings.patch           |  8 ++--
 llama/patches/0004-clip-unicode.patch         |  8 ++--
 llama/patches/0005-solar-pro.patch            | 40 +++++++++----------
 .../0006-fix-deepseek-deseret-regex.patch     |  2 +-
 ...ntain-ordering-for-rules-for-grammar.patch | 22 ++++++++++
 ...patch => 0008-sort-devices-by-score.patch} |  0
 ...arget-ggml-cpu-for-all-cpu-variants.patch} | 12 +++---
 llama/patches/0010-remove-amx.patch           | 25 ++++++++++++
 ...h => 0011-fix-string-arr-kv-loading.patch} |  8 ++--
 llama/patches/0011-remove-amx.patch           | 25 ------------
 ...r.patch => 0012-ollama-debug-tensor.patch} |  4 +-
 ...dd-ollama-vocab-for-grammar-support.patch} |  8 ++--
 ...4-add-argsort-and-cuda-copy-for-i32.patch} | 16 ++++----
 ...5-graph-memory-reporting-on-failure.patch} |  4 +-
 ...patch => 0016-ggml-Export-GPU-UUIDs.patch} | 14 +++----
 17 files changed, 129 insertions(+), 107 deletions(-)
 create mode 100644 llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
 rename llama/patches/{0009-sort-devices-by-score.patch => 0008-sort-devices-by-score.patch} (100%)
 rename llama/patches/{0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch => 0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch} (71%)
 create mode 100644 llama/patches/0010-remove-amx.patch
 rename llama/patches/{0012-fix-string-arr-kv-loading.patch => 0011-fix-string-arr-kv-loading.patch} (94%)
 delete mode 100644 llama/patches/0011-remove-amx.patch
 rename llama/patches/{0013-ollama-debug-tensor.patch => 0012-ollama-debug-tensor.patch} (91%)
 rename llama/patches/{0014-add-ollama-vocab-for-grammar-support.patch => 0013-add-ollama-vocab-for-grammar-support.patch} (97%)
 rename llama/patches/{0015-add-argsort-and-cuda-copy-for-i32.patch => 0014-add-argsort-and-cuda-copy-for-i32.patch} (96%)
 rename llama/patches/{0016-graph-memory-reporting-on-failure.patch => 0015-graph-memory-reporting-on-failure.patch} (98%)
 rename llama/patches/{0017-ggml-Export-GPU-UUIDs.patch => 0016-ggml-Export-GPU-UUIDs.patch} (92%)

diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
index edeeb4ffa..4f569c8f3 100644
--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -24,7 +24,7 @@ problem.
  9 files changed, 21 insertions(+), 2 deletions(-)
 
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index b30b4cb3..0ce73a99 100644
+index b1050ad5..e8694e5c 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -43,7 +43,7 @@ index b30b4cb3..0ce73a99 100644
  }
  
  static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1879,6 +1879,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
  
  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index b30b4cb3..0ce73a99 100644
  }
  
  static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1926,7 +1931,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
  };
  
  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -65,10 +65,10 @@ index b30b4cb3..0ce73a99 100644
      /* .init_tensor     = */ NULL, // no initialization required
      /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index e2617b06..242e50a7 100644
+index c0ea2600..6c3398da 100755
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
-@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
+@@ -801,6 +801,7 @@ static void ggml_backend_cann_buffer_free_buffer(
      ggml_backend_cann_buffer_context* ctx =
          (ggml_backend_cann_buffer_context*)buffer->context;
      delete ctx;
@@ -76,7 +76,7 @@ index e2617b06..242e50a7 100644
  }
  
  /**
-@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1473,6 +1474,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
   */
  static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
      ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
  
  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index b4b85abc..cb0d8528 100644
+index 2a6f7f10..ec031650 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -104,7 +104,7 @@ index b4b85abc..cb0d8528 100644
  }
  
  static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1071,6 +1073,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
  
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
  
  static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 576f9581..1b56f858 100644
+index bc93bc63..fd3a9d1b 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5272,6 +5272,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
      }
  
      free(ctx);
@@ -137,10 +137,10 @@ index 576f9581..1b56f858 100644
  
  static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 05a2f4e6..392cc18d 100644
+index 80a36438..6abb0ab2 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -2366,6 +2366,7 @@ struct ggml_backend_opencl_buffer_context {
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
@@ -161,10 +161,10 @@ index 4f0abb5a..de1ec184 100644
  
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 0ea72994..ae3a3c33 100644
+index 78513114..0dabdfe7 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
      ggml_sycl_set_device(ctx->device);
  
      delete ctx;
@@ -172,7 +172,7 @@ index 0ea72994..ae3a3c33 100644
  }
  catch (sycl::exception const &exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -791,6 +792,7 @@ struct ggml_backend_sycl_split_buffer_context {
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
      delete ctx;
@@ -180,7 +180,7 @@ index 0ea72994..ae3a3c33 100644
  }
  
  static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1133,6 +1135,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
  
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 0ea72994..ae3a3c33 100644
  
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index e2b357fd..68768029 100644
+index 3e43b03b..01776f3d 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -9272,6 +9272,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
      ggml_vk_destroy_buffer(ctx->dev_buffer);
      delete ctx;
@@ -200,7 +200,7 @@ index e2b357fd..68768029 100644
  }
  
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -9415,6 +9416,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
      ggml_vk_host_free(vk_instance.devices[0], buffer->context);
diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch
index 07aa4b0ea..3caf5287a 100644
--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -10,7 +10,7 @@ logs instead of throwing an error
  1 file changed, 3 insertions(+), 11 deletions(-)
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 9389ca80..806c1b3d 100644
+index ba2e1864..0d7ad157 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch
index 80d6b55e5..246cd2919 100644
--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
@@ -11,10 +11,10 @@ instead of forcing one or the error
  1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 62246c10..dca22d8b 100644
+index c29fe7e4..148d1132 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -952,7 +952,7 @@ int llama_context::decode(llama_batch & inp_batch) {
      int64_t n_outputs_all = 0;
  
      // count outputs
@@ -23,7 +23,7 @@ index 62246c10..dca22d8b 100644
          for (uint32_t i = 0; i < n_tokens_all; ++i) {
              n_outputs_all += batch.logits[i] != 0;
          }
-@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1083,7 +1083,7 @@ int llama_context::decode(llama_batch & inp_batch) {
          //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
          //}
  
@@ -32,7 +32,7 @@ index 62246c10..dca22d8b 100644
          auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
  
          if (t_embd && res->get_embd_pooled()) {
-@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1244,7 +1244,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
      const auto n_embd  = hparams.n_embd;
  
      // TODO: use a per-batch flag for logits presence instead
diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch
index 957109783..5a3b1e43d 100644
--- a/llama/patches/0004-clip-unicode.patch
+++ b/llama/patches/0004-clip-unicode.patch
@@ -10,10 +10,10 @@ filesystems for paths that include wide characters
  1 file changed, 39 insertions(+)
 
 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 41ba45a7..cdd8ca44 100644
+index c25bacc1..b3f92814 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
-@@ -31,6 +31,19 @@
+@@ -28,6 +28,19 @@
  #include <numeric>
  #include <functional>
  
@@ -33,7 +33,7 @@ index 41ba45a7..cdd8ca44 100644
  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
  
  enum ffn_op_type {
-@@ -2190,7 +2203,29 @@ struct clip_model_loader {
+@@ -2552,7 +2565,29 @@ struct clip_model_loader {
          {
              std::vector<uint8_t> read_buf;
  
@@ -63,7 +63,7 @@ index 41ba45a7..cdd8ca44 100644
              if (!fin) {
                  throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
              }
-@@ -2217,7 +2252,11 @@ struct clip_model_loader {
+@@ -2579,7 +2614,11 @@ struct clip_model_loader {
                      ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
                  }
              }
diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch
index deb53c225..be6ec4c29 100644
--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
  7 files changed, 248 insertions(+)
 
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index f2bc8ca7..5ab3f572 100644
+index c0590e10..6d9f0719 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
 @@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -34,7 +34,7 @@ index f2bc8ca7..5ab3f572 100644
      { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
      { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
  
-@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1508,6 +1510,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
          },
      },
@@ -59,7 +59,7 @@ index f2bc8ca7..5ab3f572 100644
      {
          LLM_ARCH_WAVTOKENIZER_DEC,
          {
-@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1686,6 +1706,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
      // this tensor is loaded for T5, but never used
      {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,7 +68,7 @@ index f2bc8ca7..5ab3f572 100644
      {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
      {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 41a023da..525c1b7d 100644
+index 930cb4ec..591bc14e 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
 @@ -73,6 +73,7 @@ enum llm_arch {
@@ -87,7 +87,7 @@ index 41a023da..525c1b7d 100644
      LLM_KV_ATTENTION_KEY_LENGTH_MLA,
      LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
  
-@@ -346,6 +348,7 @@ enum llm_tensor {
+@@ -348,6 +350,7 @@ enum llm_tensor {
      LLM_TENSOR_ENC_OUTPUT_NORM,
      LLM_TENSOR_CLS,
      LLM_TENSOR_CLS_OUT,
@@ -96,10 +96,10 @@ index 41a023da..525c1b7d 100644
      LLM_TENSOR_CONVNEXT_DW,
      LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index 90dfe7a7..8a667960 100644
+index 1499eb08..aa7a4b23 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
+@@ -86,6 +86,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
      return ssm_d_state * ssm_d_inner;
  }
  
@@ -113,12 +113,12 @@ index 90dfe7a7..8a667960 100644
 +
  bool llama_hparams::is_swa(uint32_t il) const {
      if (il < n_layer) {
-         return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
+         return swa_layers[il];
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 7ee6a5b7..48dce407 100644
+index b2bcb8b0..347d239d 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -55,6 +55,8 @@ struct llama_hparams {
+@@ -59,6 +59,8 @@ struct llama_hparams {
      std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
      std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
  
@@ -127,7 +127,7 @@ index 7ee6a5b7..48dce407 100644
      uint32_t n_layer_dense_lead = 0;
      uint32_t n_lora_q           = 0;
      uint32_t n_lora_kv          = 0;
-@@ -154,6 +156,9 @@ struct llama_hparams {
+@@ -186,6 +188,9 @@ struct llama_hparams {
      // dimension of the recurrent state embeddings
      uint32_t n_embd_v_s() const;
  
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
  };
  
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 4cce5166..7f6617fa 100644
+index ddb1b036..f4a6c2cd 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -439,6 +439,7 @@ namespace GGUFMeta {
@@ -150,10 +150,10 @@ index 4cce5166..7f6617fa 100644
  llama_model_loader::llama_model_loader(
          const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 3a4e72a3..831b68c0 100644
+index afef8487..c042546c 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1417,6 +1417,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                      default: type = LLM_TYPE_UNKNOWN;
                 }
              } break;
@@ -175,7 +175,7 @@ index 3a4e72a3..831b68c0 100644
          case LLM_ARCH_WAVTOKENIZER_DEC:
              {
                  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3797,6 +3812,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  
                          layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  
@@ -210,7 +210,7 @@ index 3a4e72a3..831b68c0 100644
                          layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                          layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                          layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12721,6 +12764,165 @@ struct llm_build_chameleon : public llm_graph_context {
      }
  };
  
@@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
++                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 +
 +                // compute Q and K and RoPE them
 +                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -376,7 +376,7 @@ index 3a4e72a3..831b68c0 100644
  struct llm_build_wavtokenizer_dec : public llm_graph_context {
      llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          ggml_tensor * cur;
-@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13515,6 +13717,10 @@ llm_graph_result_ptr llama_model::build_graph(
              {
                  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
              } break;
@@ -387,7 +387,7 @@ index 3a4e72a3..831b68c0 100644
          case LLM_ARCH_WAVTOKENIZER_DEC:
              {
                  llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13663,6 +13869,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
          case LLM_ARCH_GRANITE:
          case LLM_ARCH_GRANITE_MOE:
          case LLM_ARCH_CHAMELEON:
@@ -396,7 +396,7 @@ index 3a4e72a3..831b68c0 100644
              return LLAMA_ROPE_TYPE_NORM;
  
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 6bdec263..43746c7d 100644
+index cbea2cb3..43e7fcda 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
 @@ -65,6 +65,7 @@ enum llm_type {
diff --git a/llama/patches/0006-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch
index ff4b57577..998d5e76f 100644
--- a/llama/patches/0006-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0006-fix-deepseek-deseret-regex.patch
@@ -12,7 +12,7 @@ regex
  2 files changed, 22 insertions(+), 1 deletion(-)
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 806c1b3d..10f34d33 100644
+index 0d7ad157..d007039f 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
diff --git a/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
new file mode 100644
index 000000000..182760fce
--- /dev/null
+++ b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
@@ -0,0 +1,22 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 19:43:40 -0700
+Subject: [PATCH] maintain ordering for rules for grammar
+
+---
+ common/json-schema-to-grammar.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
+index d38a74f9..2a8aeca6 100644
+--- a/common/json-schema-to-grammar.cpp
++++ b/common/json-schema-to-grammar.cpp
+@@ -350,7 +350,7 @@ private:
+     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
+     std::function<json(const std::string &)> _fetch_json;
+     bool _dotall;
+-    std::map<std::string, std::string> _rules;
++    std::unordered_map<std::string, std::string> _rules;
+     std::unordered_map<std::string, json> _refs;
+     std::unordered_set<std::string> _refs_being_resolved;
+     std::vector<std::string> _errors;
diff --git a/llama/patches/0009-sort-devices-by-score.patch b/llama/patches/0008-sort-devices-by-score.patch
similarity index 100%
rename from llama/patches/0009-sort-devices-by-score.patch
rename to llama/patches/0008-sort-devices-by-score.patch
diff --git a/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
similarity index 71%
rename from llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
rename to llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
index 21c1fc42f..32fcc7ceb 100644
--- a/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0009-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
  1 file changed, 2 insertions(+)
 
 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index ddea5ad3..45918bf6 100644
+index 7dcb031f..770e18bc 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -282,6 +282,7 @@ function(ggml_add_cpu_backend_variant tag_name)
      endforeach()
  
      ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,11 +19,11 @@ index ddea5ad3..45918bf6 100644
  endfunction()
  
  ggml_add_backend(CPU)
-@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -290,6 +291,7 @@ if (GGML_CPU_ALL_VARIANTS)
      if (NOT GGML_BACKEND_DL)
          message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
      endif()
 +    add_custom_target(ggml-cpu)
-     ggml_add_cpu_backend_variant(x64)
-     ggml_add_cpu_backend_variant(sse42        SSE42)
-     ggml_add_cpu_backend_variant(sandybridge  SSE42 AVX)
+     if (GGML_SYSTEM_ARCH STREQUAL "x86")
+         ggml_add_cpu_backend_variant(x64)
+         ggml_add_cpu_backend_variant(sse42        SSE42)
diff --git a/llama/patches/0010-remove-amx.patch b/llama/patches/0010-remove-amx.patch
new file mode 100644
index 000000000..1dcf58492
--- /dev/null
+++ b/llama/patches/0010-remove-amx.patch
@@ -0,0 +1,25 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Thu, 1 May 2025 15:05:08 -0700
+Subject: [PATCH] remove amx
+
+disable amx as it reduces performance on some systems
+---
+ ggml/src/CMakeLists.txt | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 770e18bc..62f3dbf6 100644
+--- a/ggml/src/CMakeLists.txt
++++ b/ggml/src/CMakeLists.txt
+@@ -300,10 +300,6 @@ if (GGML_CPU_ALL_VARIANTS)
+         ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+         ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+         ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+-        if (NOT MSVC)
+-            # MSVC doesn't support AMX
+-            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+-        endif()
+     else()
+         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+     endif()
diff --git a/llama/patches/0012-fix-string-arr-kv-loading.patch b/llama/patches/0011-fix-string-arr-kv-loading.patch
similarity index 94%
rename from llama/patches/0012-fix-string-arr-kv-loading.patch
rename to llama/patches/0011-fix-string-arr-kv-loading.patch
index f879c50ee..20c348048 100644
--- a/llama/patches/0012-fix-string-arr-kv-loading.patch
+++ b/llama/patches/0011-fix-string-arr-kv-loading.patch
@@ -25,10 +25,10 @@ index 79ee2020..3efb22f0 100644
      // get ith C string from array with given key_id
      GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
 diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
-index 381a9c7d..e45b453d 100644
+index a0a318a2..b3326b94 100644
 --- a/ggml/src/gguf.cpp
 +++ b/ggml/src/gguf.cpp
-@@ -777,10 +777,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
+@@ -794,10 +794,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
  
  const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
      GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
@@ -44,7 +44,7 @@ index 381a9c7d..e45b453d 100644
  const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
      GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
      GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
-@@ -874,7 +878,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
+@@ -891,7 +895,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
  const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
      GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
      GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
  }
  
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 10f34d33..9f5fd57b 100644
+index d007039f..4a6c3ad6 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
diff --git a/llama/patches/0011-remove-amx.patch b/llama/patches/0011-remove-amx.patch
deleted file mode 100644
index 296a37612..000000000
--- a/llama/patches/0011-remove-amx.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Thu, 1 May 2025 15:05:08 -0700
-Subject: [PATCH] remove amx
-
-disable amx as it reduces performance on some systems
----
- ggml/src/CMakeLists.txt | 4 ----
- 1 file changed, 4 deletions(-)
-
-diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 45918bf6..0beaed86 100644
---- a/ggml/src/CMakeLists.txt
-+++ b/ggml/src/CMakeLists.txt
-@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
-     ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
-     ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-     ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
--    if (NOT MSVC)
--        # MSVC doesn't support AMX
--        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
--    endif()
- elseif (GGML_CPU)
-     ggml_add_cpu_backend_variant_impl("")
- endif()
diff --git a/llama/patches/0013-ollama-debug-tensor.patch b/llama/patches/0012-ollama-debug-tensor.patch
similarity index 91%
rename from llama/patches/0013-ollama-debug-tensor.patch
rename to llama/patches/0012-ollama-debug-tensor.patch
index 53d911277..098f4d5aa 100644
--- a/llama/patches/0013-ollama-debug-tensor.patch
+++ b/llama/patches/0012-ollama-debug-tensor.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
  1 file changed, 6 insertions(+)
 
 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index a30e67f2..2462d2b8 100644
+index c7426df2..23441678 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2873,6 +2875,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  
          ggml_compute_forward(&params, node);
  
diff --git a/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0013-add-ollama-vocab-for-grammar-support.patch
similarity index 97%
rename from llama/patches/0014-add-ollama-vocab-for-grammar-support.patch
rename to llama/patches/0013-add-ollama-vocab-for-grammar-support.patch
index ee81800e2..2d3731236 100644
--- a/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch
+++ b/llama/patches/0013-add-ollama-vocab-for-grammar-support.patch
@@ -10,7 +10,7 @@ Subject: [PATCH] add ollama vocab for grammar support
  3 files changed, 58 insertions(+), 9 deletions(-)
 
 diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
-index 973b47ae..60d58236 100644
+index bed706bb..b51cee09 100644
 --- a/src/llama-grammar.cpp
 +++ b/src/llama-grammar.cpp
 @@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -90,7 +90,7 @@ index 973b47ae..60d58236 100644
  
      if (grammar.awaiting_trigger) {
          if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
-@@ -1191,13 +1200,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
+@@ -1201,13 +1210,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
          }
      }
  
@@ -107,7 +107,7 @@ index 973b47ae..60d58236 100644
      }
  
      llama_grammar_accept_str(grammar, piece);
-@@ -1217,3 +1227,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
+@@ -1227,3 +1237,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
          throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
      }
  }
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
                        const char * grammar_root,
                                bool lazy,
 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index 804b11e0..15a10ca8 100644
+index bfbf5fa2..11f93f42 100644
 --- a/src/llama-sampling.cpp
 +++ b/src/llama-sampling.cpp
 @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
diff --git a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch
similarity index 96%
rename from llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
rename to llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch
index b71295c76..7107cd049 100644
--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0014-add-argsort-and-cuda-copy-for-i32.patch
@@ -10,10 +10,10 @@ Subject: [PATCH] add argsort and cuda copy for i32
  3 files changed, 192 insertions(+), 2 deletions(-)
 
 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index becdae07..7a44b6cf 100644
+index 08facb6d..aa5cf56b 100644
 --- a/ggml/src/ggml-cpu/ops.cpp
 +++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
+@@ -6925,6 +6925,45 @@ static void ggml_compute_forward_argsort_f32(
      }
  }
  
@@ -59,7 +59,7 @@ index becdae07..7a44b6cf 100644
  void ggml_compute_forward_argsort(
      const ggml_compute_params * params,
      ggml_tensor * dst) {
-@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
+@@ -6936,6 +6975,10 @@ void ggml_compute_forward_argsort(
              {
                  ggml_compute_forward_argsort_f32(params, dst);
              } break;
@@ -195,10 +195,10 @@ index 607ded85..53b02634 100644
 +    }
  }
 diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
-index 2d46176e..47383486 100644
+index 2c55d214..90d95d32 100644
 --- a/ggml/src/ggml-cuda/cpy.cu
 +++ b/ggml/src/ggml-cuda/cpy.cu
-@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+@@ -41,6 +41,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
      *dsti = *xi;
  }
  
@@ -212,7 +212,7 @@ index 2d46176e..47383486 100644
  template <cpy_kernel_t cpy_1>
  static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
+@@ -71,6 +78,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
      cpy_1(cx + x_offset, cdst + dst_offset);
  }
  
@@ -257,7 +257,7 @@ index 2d46176e..47383486 100644
  static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
      const float * xi = (const float *) cxi;
      block_q8_0 * dsti = (block_q8_0 *) cdsti;
-@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+@@ -643,6 +688,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
          ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
      } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
          ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@@ -266,7 +266,7 @@ index 2d46176e..47383486 100644
      } else {
          GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                  ggml_type_name(src0->type), ggml_type_name(src1->type));
-@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+@@ -698,6 +745,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
          return (void*) cpy_f32_f16<cpy_1_f32_f16>;
      } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
          return (void*) cpy_f32_f16<cpy_1_f16_f32>;
diff --git a/llama/patches/0016-graph-memory-reporting-on-failure.patch b/llama/patches/0015-graph-memory-reporting-on-failure.patch
similarity index 98%
rename from llama/patches/0016-graph-memory-reporting-on-failure.patch
rename to llama/patches/0015-graph-memory-reporting-on-failure.patch
index 921882249..115c3ab21 100644
--- a/llama/patches/0016-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0015-graph-memory-reporting-on-failure.patch
@@ -134,10 +134,10 @@ index 5fd379f6..04812990 100644
  
  static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 0ce73a99..be335e8c 100644
+index e8694e5c..36f11537 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
+@@ -1637,6 +1637,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
      return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
  }
  
diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0016-ggml-Export-GPU-UUIDs.patch
similarity index 92%
rename from llama/patches/0017-ggml-Export-GPU-UUIDs.patch
rename to llama/patches/0016-ggml-Export-GPU-UUIDs.patch
index a2539034c..b56785a30 100644
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0016-ggml-Export-GPU-UUIDs.patch
@@ -24,10 +24,10 @@ index 74e46716..a880df33 100644
          size_t memory_total;
          enum ggml_backend_dev_type type;
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..4c829153 100644
+index ec031650..8d5edd04 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
+@@ -2893,6 +2893,7 @@ struct ggml_backend_cuda_device_context {
      int device;
      std::string name;
      std::string description;
@@ -35,7 +35,7 @@ index cb0d8528..4c829153 100644
  };
  
  static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
+@@ -2905,6 +2906,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
      return ctx->description.c_str();
  }
  
@@ -47,7 +47,7 @@ index cb0d8528..4c829153 100644
  static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
      ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
      ggml_cuda_set_device(ctx->device);
-@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -2919,6 +2925,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
      props->name        = ggml_backend_cuda_device_get_name(dev);
      props->description = ggml_backend_cuda_device_get_description(dev);
@@ -55,7 +55,7 @@ index cb0d8528..4c829153 100644
      props->type        = ggml_backend_cuda_device_get_type(dev);
      ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
  
-@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3473,6 +3480,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                  CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                  dev_ctx->description = prop.name;
  
@@ -89,10 +89,10 @@ index cb0d8528..4c829153 100644
                      /* .iface   = */ ggml_backend_cuda_device_interface,
                      /* .reg     = */ &reg,
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..ee4f2dcb 100644
+index fd3a9d1b..884bde80 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
+@@ -5761,6 +5761,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
  static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
      props->name        = ggml_backend_metal_device_get_name(dev);
      props->description = ggml_backend_metal_device_get_description(dev);