Update to B7618
parent 47a8e00686
commit 7f25eb0038
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=18ddaea2aecf7fbfe7acab77465808f3cf6200d3
+FETCH_HEAD=9dba9f5352308894bfb8786fcfe7c284168ff8f5
 
 .PHONY: help
 help:

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "18ddaea2aecf7fbfe7acab77465808f3cf6200d3";
+char const *LLAMA_COMMIT = "9dba9f5352308894bfb8786fcfe7c284168ff8f5";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";

@@ -73,7 +73,7 @@ index 303278397..7d1733adb 100644
 {
     GGML_ABORT("fatal error");
 diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
-index da9652c3b..b82be371c 100644
+index 99669200f..5e261fd2d 100644
 --- a/ggml/src/ggml-cuda/argsort.cu
 +++ b/ggml/src/ggml-cuda/argsort.cu
 @@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,

@@ -29,8 +29,8 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                      const int nrows,
                                      ggml_sort_order order,
                                      cudaStream_t stream) {
-    ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
-    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
+    ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ((size_t) ncols) * nrows);
+    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ((size_t) ncols) * nrows);
     ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
 
     int * temp_indices = temp_indices_alloc.get();
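
Note: the only change in this hunk is the allocation-size arithmetic. Both ncols and nrows are int, so ncols * nrows is evaluated in 32-bit arithmetic and can overflow before the result ever reaches the allocator's size_t parameter; casting one operand first forces the whole multiply into size_t. A minimal standalone sketch of the failure mode (the 70000 values are hypothetical, chosen only so the product exceeds INT_MAX):

    #include <cstdio>
    #include <cstddef>

    int main() {
        const int ncols = 70000;  // hypothetical: the product with nrows
        const int nrows = 70000;  // exceeds INT_MAX (2147483647)

        // int * int is computed in 32 bits; the overflow is undefined
        // behavior and in practice wraps before the widening to size_t.
        const size_t bad = (size_t) (ncols * nrows);

        // Widening one operand first makes the whole multiply 64-bit,
        // mirroring the ((size_t) ncols) * nrows change above.
        const size_t good = ((size_t) ncols) * nrows;

        printf("bad  = %zu\n", bad);   // typically 605032704 (wrapped)
        printf("good = %zu\n", good);  // 4900000000
        return 0;
    }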

@@ -918,7 +918,7 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
 
-        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
+        dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2));
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
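
Note: two things change on the replaced line. The first operand is widened to size_t for the same overflow reason as in argsort.cu, and the size goes from (2*2 + DV) * sizeof(float) bytes to a bare (2 + DV/2) count, which reads as if dst_tmp_meta is now allocated in two-float (float2-sized) elements rather than bytes; that element type is an inference, since it is not visible in this hunk. For even DV the two totals agree, which this sketch checks (DV = 128 is just an illustrative head size):

    #include <cassert>
    #include <cstddef>

    struct float2_t { float x, y; };  // stand-in for CUDA's float2

    int main() {
        const size_t DV = 128;  // hypothetical V head size; any even value works

        const size_t old_bytes = (2*2 + DV) * sizeof(float);  // old expression: bytes
        const size_t new_count = 2 + DV/2;                    // new expression: elements

        // (4 + DV) * 4 == (2 + DV/2) * 8 whenever DV is even.
        assert(old_bytes == new_count * sizeof(float2_t));
        return 0;
    }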