From 7f25eb00389e999ac78c51b00db342b79941616f Mon Sep 17 00:00:00 2001 From: inforithmics Date: Sat, 3 Jan 2026 12:54:46 +0100 Subject: [PATCH] Update to B7618 --- Makefile.sync | 2 +- llama/build-info.cpp | 2 +- llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch | 2 +- ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu | 4 ++-- ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile.sync b/Makefile.sync index 4be4d203c..a670fd9f4 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -1,6 +1,6 @@ UPSTREAM=https://github.com/ggml-org/llama.cpp.git WORKDIR=llama/vendor -FETCH_HEAD=18ddaea2aecf7fbfe7acab77465808f3cf6200d3 +FETCH_HEAD=9dba9f5352308894bfb8786fcfe7c284168ff8f5 .PHONY: help help: diff --git a/llama/build-info.cpp b/llama/build-info.cpp index f497f0107..0df09676c 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,4 +1,4 @@ int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "18ddaea2aecf7fbfe7acab77465808f3cf6200d3"; +char const *LLAMA_COMMIT = "9dba9f5352308894bfb8786fcfe7c284168ff8f5"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch index cce1466a5..46d743322 100644 --- a/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch +++ b/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch @@ -73,7 +73,7 @@ index 303278397..7d1733adb 100644 { GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu -index da9652c3b..b82be371c 100644 +index 99669200f..5e261fd2d 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu index b82be371c..5e261fd2d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu @@ -29,8 +29,8 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, const int nrows, ggml_sort_order order, cudaStream_t stream) { - ggml_cuda_pool_alloc temp_indices_alloc(pool, ncols * nrows); - ggml_cuda_pool_alloc temp_keys_alloc(pool, ncols * nrows); + ggml_cuda_pool_alloc temp_indices_alloc(pool, ((size_t) ncols) * nrows); + ggml_cuda_pool_alloc temp_keys_alloc(pool, ((size_t) ncols) * nrows); ggml_cuda_pool_alloc offsets_alloc(pool, nrows + 1); int * temp_indices = temp_indices_alloc.get(); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh index 8dc82a9d3..fa4e87ee4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh @@ -918,7 +918,7 @@ void launch_fattn( blocks_num.y = 1; blocks_num.z = 1; - dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float)); + dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2)); } else { const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.