From 7f25eb00389e999ac78c51b00db342b79941616f Mon Sep 17 00:00:00 2001
From: inforithmics <thomas.stocker@gmail.com>
Date: Sat, 3 Jan 2026 12:54:46 +0100
Subject: [PATCH] Update llama.cpp to b7618

---
 Makefile.sync                                              | 2 +-
 llama/build-info.cpp                                       | 2 +-
 llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch | 2 +-
 ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu              | 4 ++--
 ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh        | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/Makefile.sync b/Makefile.sync
index 4be4d203c..a670fd9f4 100644
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=18ddaea2aecf7fbfe7acab77465808f3cf6200d3
+FETCH_HEAD=9dba9f5352308894bfb8786fcfe7c284168ff8f5
 
 .PHONY: help
 help:
diff --git a/llama/build-info.cpp b/llama/build-info.cpp
index f497f0107..0df09676c 100644
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "18ddaea2aecf7fbfe7acab77465808f3cf6200d3";
+char const *LLAMA_COMMIT = "9dba9f5352308894bfb8786fcfe7c284168ff8f5";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
diff --git a/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch
index cce1466a5..46d743322 100644
--- a/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch
@@ -73,7 +73,7 @@ index 303278397..7d1733adb 100644
              {
                  GGML_ABORT("fatal error");
 diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
-index da9652c3b..b82be371c 100644
+index 99669200f..5e261fd2d 100644
 --- a/ggml/src/ggml-cuda/argsort.cu
 +++ b/ggml/src/ggml-cuda/argsort.cu
 @@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float *   x,
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
index b82be371c..5e261fd2d 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@@ -29,8 +29,8 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                      const int        nrows,
                                      ggml_sort_order  order,
                                      cudaStream_t     stream) {
-    ggml_cuda_pool_alloc<int>   temp_indices_alloc(pool, ncols * nrows);
-    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
+    ggml_cuda_pool_alloc<int>   temp_indices_alloc(pool, ((size_t) ncols) * nrows);
+    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ((size_t) ncols) * nrows);
     ggml_cuda_pool_alloc<int>   offsets_alloc(pool, nrows + 1);
 
     int *   temp_indices = temp_indices_alloc.get();
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh
index 8dc82a9d3..fa4e87ee4 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh
@@ -918,7 +918,7 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
 
-        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
+        dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2));
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.