diff --git a/docs/api.md b/docs/api.md index 41858885b..683db3573 100644 --- a/docs/api.md +++ b/docs/api.md @@ -500,11 +500,11 @@ The `message` object has the following fields: - `thinking`: (for thinking models) the model's thinking process - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) - `tool_calls` (optional): a list of tools in JSON that the model wants to use -- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result +- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result Advanced parameters (optional): -- `format`: the format to return a response in. Format can be `json` or a JSON schema. +- `format`: the format to return a response in. Format can be `json` or a JSON schema. - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) diff --git a/docs/development.md b/docs/development.md index 24bcba194..9726b5d91 100644 --- a/docs/development.md +++ b/docs/development.md @@ -118,7 +118,7 @@ To run tests, use `go test`: go test ./... ``` -> NOTE: In rare cirumstances, you may need to change a package using the new +> NOTE: In rare circumstances, you may need to change a package using the new > "synctest" package in go1.24. > > If you do not have the "synctest" package enabled, you will not see build or diff --git a/docs/openai.md b/docs/openai.md index d0bac4cd3..26930124c 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama") # Define the schema for the response class FriendInfo(BaseModel): name: str - age: int + age: int is_available: bool class FriendList(BaseModel): diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 995b33aca..6fdd3e85b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log On **Linux** systems with systemd, the logs can be found with this command: ```shell -journalctl -u ollama --no-pager --follow --pager-end +journalctl -u ollama --no-pager --follow --pager-end ``` When you run Ollama in a **container**, the logs go to stdout/stderr in the container: @@ -23,7 +23,7 @@ docker logs If manually running `ollama serve` in a terminal, the logs will be on that terminal. When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in: -- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` +- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored @@ -38,7 +38,7 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs. ## LLM libraries -Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. 
Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. +Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. In the server log, you will see a message that looks something like this (varies from release to release): @@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. -When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. 
This can help show more detailed error codes that can help troubleshoot problems diff --git a/kvcache/causal.go b/kvcache/causal.go index 8b101a817..496eeaa64 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -646,18 +646,31 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { seqRange := c.cellRanges[seq] for start := seqRange.min; start <= seqRange.max; start += c.maxBatch { - ctx := c.backend.NewContext() - size := min(seqRange.max-start+1, c.maxBatch) offsets := make([]int32, size) + + var batchFirst, batchLast int + + batchFirst = -1 for i := range offsets { cell := c.cells[start+i] if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { offsets[i] = offset + if batchFirst < 0 { + batchFirst = i + } + batchLast = i } } + if batchFirst < 0 { + continue + } + + offsets = offsets[batchFirst : batchLast+1] + + ctx := c.backend.NewContext() kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) for i, key := range c.keys { @@ -669,10 +682,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { numKVHeads := key.Dim(1) rowSize := key.Stride(2) - key = key.View(ctx, rowSize*start, + key = key.View(ctx, rowSize*(start+batchFirst), kHeadDim, key.Stride(1), numKVHeads, key.Stride(2), - size, + len(offsets), ) roped, err := c.shiftFn(ctx, i, key, kShift) diff --git a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch new file mode 100644 index 000000000..b9dd6cdc6 --- /dev/null +++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch @@ -0,0 +1,50 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Oliver Simons +Date: Tue, 22 Jul 2025 11:02:28 +0200 +Subject: [PATCH] Enable CUDA Graphs for gemma3n. + +Similar to +https://github.com/ggml-org/llama.cpp/pull/14741, +though ollama has a slightly different model graph +than llama.cpp which requires different workaround +checks. +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 2b9fabf4..28ccf4be 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + ++ const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; ++ const std::string gemma3n_node_name = "node_"; ++ + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + +@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { +- // disable CUDA graphs for batch size > 1 for now. +- // Changes in batch size or context size can cause changes to the grid size of some kernels. ++ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n ++ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here ++ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 ++ && node->ne[2] == 1 ++ && node->ne[3] == 1 ++ && node->src[0] ? 
std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false ++ && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + #endif + } + diff --git a/llama/patches/0022-BF16-macos-version-guard.patch b/llama/patches/0022-BF16-macos-version-guard.patch new file mode 100644 index 000000000..68aac0bb0 --- /dev/null +++ b/llama/patches/0022-BF16-macos-version-guard.patch @@ -0,0 +1,27 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Wed, 30 Jul 2025 08:43:46 -0700 +Subject: [PATCH] BF16 macos version guard + +Only enable BF16 on supported MacOS versions (v14+) +--- + ggml/src/ggml-metal/ggml-metal.m | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 110c9ece..ab46f6e3 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -89,7 +89,11 @@ + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; + + #if defined(GGML_METAL_USE_BF16) +- ctx->use_bfloat = ctx->has_bfloat; ++ if (@available(macOS 14.0, *)) { ++ ctx->use_bfloat = ctx->has_bfloat; ++ } else { ++ ctx->use_bfloat = false; ++ } + #else + ctx->use_bfloat = false; + #endif diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 1610e5d4b..0ae0933f3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2593,6 +2593,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; + const std::string gemma3n_node_name = "node_"; + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2614,12 +2617,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. + // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n + // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 + && node->ne[2] == 1 + && node->ne[3] == 1 + && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false + && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 731c25eba..5576239a9 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -99,7 +99,11 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; #if defined(GGML_METAL_USE_BF16) - ctx->use_bfloat = ctx->has_bfloat; + if (@available(macOS 14.0, *)) { + ctx->use_bfloat = ctx->has_bfloat; + } else { + ctx->use_bfloat = false; + } #else ctx->use_bfloat = false; #endif diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go index 715b8a0ea..b75a2abb3 100644 --- a/model/models/gemma3n/model_text.go +++ b/model/models/gemma3n/model_text.go @@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions coefficients := a.PredictionCoefficient.Forward(ctx, modalities) coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2)) - hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) - predictions := coefficients.Mulmat(ctx, hiddenStates) - predictions = predictions.Add(ctx, hiddenStates) - return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)) + predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + return predictions.Add(ctx, hiddenStates) } func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor {
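
The docs/api.md hunk above touches the chat `message` fields (`tool_name` in particular). As a quick illustration of how those documented JSON keys fit together, here is a hypothetical Go sketch; the struct and tool-call shape are assumptions for illustration, not types from Ollama's `api` package.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative sketch only: the JSON keys mirror the fields documented in
// docs/api.md, but this struct and the tool-call shape are assumptions, not
// types from Ollama's api package.
type ToolCallFunction struct {
	Name      string         `json:"name"`
	Arguments map[string]any `json:"arguments"`
}

type ToolCall struct {
	Function ToolCallFunction `json:"function"`
}

type ChatMessage struct {
	Role      string     `json:"role"`
	Content   string     `json:"content"`
	Thinking  string     `json:"thinking,omitempty"`   // thinking models only
	Images    []string   `json:"images,omitempty"`     // multimodal models such as llava
	ToolCalls []ToolCall `json:"tool_calls,omitempty"` // tools the model wants to use
	ToolName  string     `json:"tool_name,omitempty"`  // names the tool whose result this message carries
}

func main() {
	// A hypothetical tool-result message: tool_name tells the model which tool produced Content.
	msg := ChatMessage{Role: "tool", Content: "22 degrees celsius", ToolName: "get_current_weather"}
	b, _ := json.Marshal(msg)
	fmt.Println(string(b))
}
```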
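
The kvcache/causal.go change narrows each shift batch to the span of cells that actually need their positions rotated, and skips batches with no affected cells before a backend context is allocated. Below is a minimal standalone sketch of that trimming step, with a hypothetical helper and a plain int32 slice instead of the real cell bookkeeping; here a cell counts as affected when its offset is non-zero, whereas the real loop records the indices while filling `offsets`.

```go
package main

import "fmt"

// trimShiftWindow mirrors the batchFirst/batchLast bookkeeping added to
// Causal.shift: given the per-cell offsets computed for one batch window, it
// returns the contiguous sub-slice that actually needs shifting together with
// its starting index, or ok=false when no cell in the window is affected and
// the whole batch can be skipped.
func trimShiftWindow(offsets []int32) (trimmed []int32, first int, ok bool) {
	first, last := -1, -1
	for i, off := range offsets {
		if off != 0 {
			if first < 0 {
				first = i
			}
			last = i
		}
	}
	if first < 0 {
		return nil, 0, false
	}
	return offsets[first : last+1], first, true
}

func main() {
	// Only cells 2..4 of this window belong to the shifted sequence.
	offsets := []int32{0, 0, -3, -3, -3, 0, 0, 0}
	trimmed, first, ok := trimShiftWindow(offsets)
	fmt.Println(ok, first, trimmed) // true 2 [-3 -3 -3]
}
```

In the actual hunk, the recorded first index also shifts the key view (`rowSize*(start+batchFirst)`) and `len(offsets)` replaces `size` as the row count handed to `shiftFn`.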
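
The model_text.go reorder in `AltUp.Predict` relies on the element-wise residual add commuting with the permutation: adding `hiddenStates` after permuting the predictions back gives the same result as adding the permuted copy first. A small self-contained check of that identity, with plain 2-D slices and a transpose standing in for `ml.Tensor` and the 4-D `Permute`:

```go
package main

import "fmt"

// transpose is a 2-D stand-in for the Permute/Contiguous pair used in
// AltUp.Predict; it is its own inverse here, which keeps the example short.
func transpose(m [][]float64) [][]float64 {
	out := make([][]float64, len(m[0]))
	for i := range out {
		out[i] = make([]float64, len(m))
		for j := range m {
			out[i][j] = m[j][i]
		}
	}
	return out
}

// add performs the element-wise residual addition.
func add(a, b [][]float64) [][]float64 {
	out := make([][]float64, len(a))
	for i := range a {
		out[i] = make([]float64, len(a[i]))
		for j := range a[i] {
			out[i][j] = a[i][j] + b[i][j]
		}
	}
	return out
}

func main() {
	hidden := [][]float64{{1, 2}, {3, 4}}    // stands in for hiddenStates
	preds := [][]float64{{10, 20}, {30, 40}} // stands in for the Mulmat result in the permuted layout
	before := transpose(add(preds, transpose(hidden))) // old order: add in permuted layout, permute back
	after := add(transpose(preds), hidden)             // new order: permute back, then add original hidden
	fmt.Println(before) // [[11 32] [23 44]]
	fmt.Println(after)  // [[11 32] [23 44]]
}
```

Both orderings produce the same values; the change only alters which tensors feed the final `Add`.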