Compare commits

..

6 Commits

Author SHA1 Message Date
Bruce MacDonald
057cc54b66 benchmark: compare backend graph computation times
Track execution time of individual tensor operations (views, copies, reshapes etc)
during LLM forward passes using CGo bindings to the native graph runtime. This
helps identify performance bottlenecks in the computation graph and optimize memory
operations that can significantly impact inference latency.
2025-02-19 15:22:53 -08:00
Michael Yang
1e438b237c Merge pull request #9203 from ollama/mxyng/sapphirerapids
build: remove backend build for sapphirerapids
2025-02-19 21:42:00 +00:00
yuiseki
d721a02e7d test: add test cases for ListHandler (#9146) 2025-02-19 13:24:27 -08:00
zyxucp
778603a818 docs: Add AntSK to Community Integrations (#9214) 2025-02-19 13:22:48 -08:00
maninhill
3c874df46e docs: Add MaxKB to Community Integrations (#9212) 2025-02-19 13:20:09 -08:00
Michael Yang
5f8c03189e build: remove backend build for sapphirerapids
sapphire rapids has amx support but it ends up having a negative
performance impact.

emerald rapids also has amx support with a positive performance impact
however there's no reasonable way in ggml to differentiate between the
two. the impact is small (~6%) so disable amx entirely for simplicity
2025-02-18 14:47:58 -08:00
10 changed files with 342 additions and 343 deletions

View File

@@ -382,6 +382,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
### Cloud

View File

@@ -0,0 +1,86 @@
package backend

import (
    "flag"
    "fmt"
    "io"
    "log"
    "os"
    "testing"

    "github.com/ollama/ollama/ml"
    "github.com/ollama/ollama/model"
    "github.com/ollama/ollama/server"

    _ "github.com/ollama/ollama/model/models/llama"
)

var modelName = flag.String("m", "", "Name of the model to benchmark")

func suppressOutput() (cleanup func()) {
    oldStdout, oldStderr := os.Stdout, os.Stderr
    os.Stdout, os.Stderr = nil, nil
    log.SetOutput(io.Discard)

    return func() {
        os.Stdout, os.Stderr = oldStdout, oldStderr
        log.SetOutput(os.Stderr)
    }
}

func setupModel(b *testing.B) model.Model {
    if *modelName == "" {
        b.Fatal("Error: -m flag is required for benchmark tests")
    }

    sm, err := server.GetModel(*modelName)
    if err != nil {
        b.Fatal(err)
    }

    m, err := model.New(sm.ModelPath)
    if err != nil {
        b.Fatal(err)
    }

    m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
    return m
}

func BenchmarkGGMLOperations(b *testing.B) {
    // loading the GGML back-end logs to standard out and makes the bench output messy
    cleanup := suppressOutput()
    defer cleanup()

    b.Setenv("OLLAMA_BENCHMARK", "1")
    b.Setenv("OLLAMA_BACKEND", "ggml")

    m := setupModel(b)

    // Sample input data
    inputIDs := []int32{1, 2, 3, 4, 5}
    options := model.Options{
        Inputs:    inputIDs,
        Positions: []int32{1, 2, 3, 4, 5},
        Sequences: []int{1, 1, 1, 1, 1},
        Outputs:   []int32{int32(len(inputIDs) - 1)},
    }

    b.ResetTimer()

    for range b.N {
        ctx := m.Backend().NewContext()
        defer ctx.Close()

        modelOutput, err := model.Forward(ctx, m, options)
        if err != nil {
            b.Fatal(fmt.Errorf("forward pass failed: %v", err))
        }

        ctx.Compute(modelOutput)

        for _, op := range ctx.Timing() {
            b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
        }
    }
}

View File

@@ -10,6 +10,7 @@ import (
"os"
"strings"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/spf13/cobra"
@@ -490,6 +491,96 @@ func TestPushHandler(t *testing.T) {
}
}
func TestListHandler(t *testing.T) {
    tests := []struct {
        name           string
        args           []string
        serverResponse []api.ListModelResponse
        expectedError  string
        expectedOutput string
    }{
        {
            name: "list all models",
            args: []string{},
            serverResponse: []api.ListModelResponse{
                {Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
                {Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-48 * time.Hour)},
            },
            expectedOutput: "NAME ID SIZE MODIFIED \n" +
                "model1 sha256:abc12 1.0 KB 24 hours ago \n" +
                "model2 sha256:def45 2.0 KB 2 days ago \n",
        },
        {
            name: "filter models by prefix",
            args: []string{"model1"},
            serverResponse: []api.ListModelResponse{
                {Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
                {Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-24 * time.Hour)},
            },
            expectedOutput: "NAME ID SIZE MODIFIED \n" +
                "model1 sha256:abc12 1.0 KB 24 hours ago \n",
        },
        {
            name: "server error",
            args: []string{},
            expectedError: "server error",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
                if r.URL.Path != "/api/tags" || r.Method != http.MethodGet {
                    t.Errorf("unexpected request to %s %s", r.Method, r.URL.Path)
                    http.Error(w, "not found", http.StatusNotFound)
                    return
                }

                if tt.expectedError != "" {
                    http.Error(w, tt.expectedError, http.StatusInternalServerError)
                    return
                }

                response := api.ListResponse{Models: tt.serverResponse}
                if err := json.NewEncoder(w).Encode(response); err != nil {
                    t.Fatal(err)
                }
            }))
            defer mockServer.Close()

            t.Setenv("OLLAMA_HOST", mockServer.URL)

            cmd := &cobra.Command{}
            cmd.SetContext(context.TODO())

            // Capture stdout
            oldStdout := os.Stdout
            r, w, _ := os.Pipe()
            os.Stdout = w

            err := ListHandler(cmd, tt.args)

            // Restore stdout and get output
            w.Close()
            os.Stdout = oldStdout
            output, _ := io.ReadAll(r)

            if tt.expectedError == "" {
                if err != nil {
                    t.Errorf("expected no error, got %v", err)
                }
                if got := string(output); got != tt.expectedOutput {
                    t.Errorf("expected output:\n%s\ngot:\n%s", tt.expectedOutput, got)
                }
            } else {
                if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
                    t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
                }
            }
        })
    }
}
func TestCreateHandler(t *testing.T) {
tests := []struct {
name string

View File

@@ -1,338 +0,0 @@
# Guide: Implementing Models in Ollama's Go Inference Engine
> **Note**: This guide and the Go inference engine are in early development and will be updated as implementation details evolve.
This guide outlines the process of implementing a new model in Ollama's inference engine. It covers everything from initial setup to publishing your model to ollama.com.
## Architecture Overview
Below is a diagram showing Ollama's inference engine architecture layers and how they interact:
```mermaid
graph TB
subgraph Models["Model Layer: LLM Implementations"]
direction TB
llama["model/models/llama"]
mllama["model/models/mllama"]
qwen["model/models/qwen2"]
etc["...etc"]
note1[" Each model implements a<br>specific architecture:<br>- Defines model parameters<br>- Implements forward pass"]
end
subgraph ML_Ops["Neural Network Operations"]
direction TB
nn_ops[" nn/<br>linear.go: Matrix multiplication<br>embedding.go: Token embedding lookups<br>normalization.go: Layer norm operations<br>convolution.go: Convolutional operations "]
backend[" ml/backend.go<br>Hardware Abstraction Layer:<br>- Defines tensor operations<br>- Manages computation graphs<br>- Handles memory allocation "]
note2[" Common neural net operations:<br>- Abstracts hardware details<br>- Provides unified API<br>- Manages computation flow "]
end
subgraph Hardware["Backend Execution Layer"]
direction TB
backend_impl[" The backend package provides:<br>- Unified computation interface<br>- Automatic hardware selection<br>- Optimized kernels<br>- Efficient memory management "]
subgraph Backends["Backend Implementations"]
direction LR
cpu["backend/cpu<br>- Pure Go implementation<br>- Fallback for all platforms"]
metal["backend/metal<br>- Apple Silicon (M1/M2/M3)<br>- MLX integration<br>- Leverages Apple Neural Engine"]
onnx["backend/onnx<br>- Cross-platform compatibility<br>- ONNX Runtime integration<br>- Pre-compiled graph execution"]
ggml["backend/ggml<br>- CPU/GPU quantized compute<br>- Low-precision operations<br>- Memory-efficient inferencing"]
end
end
Models --> |" Makes high-level calls<br>(e.g., self-attention) "| ML_Ops
ML_Ops --> |" Translates to tensor operations<br>(e.g., matmul, softmax) "| Hardware
backend_impl --> Backends
```
When implementing a new model, you'll primarily work in the model layer, interfacing with the neural network operations layer.
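For orientation, here is a minimal sketch of that boundary: a feed-forward (MLP) block defined in the model layer that only calls into the `nn`/`ml` packages and never touches a backend directly. The import paths, `gguf` struct tags, and method names are assumptions based on existing model implementations, not a definitive API reference.
```go
package yourmodel

import (
    "github.com/ollama/ollama/ml"
    "github.com/ollama/ollama/ml/nn"
)

// MLP is a hypothetical feed-forward block; the gguf tags map struct fields
// to tensor names in the converted model file.
type MLP struct {
    Up   *nn.Linear `gguf:"ffn_up"`
    Down *nn.Linear `gguf:"ffn_down"`
    Gate *nn.Linear `gguf:"ffn_gate"`
}

// Forward builds the SwiGLU computation (SILU(gate(x)) * up(x), projected
// back down) using only the abstract tensor operations; the selected backend
// decides how the math actually executes.
func (m *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
    hiddenState = m.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, m.Up.Forward(ctx, hiddenState))
    return m.Down.Forward(ctx, hiddenState)
}
```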
## Implementation Process Overview
Here's the high-level process for implementing a new model in Ollama:
1. **Environment Setup**: Clone the repository and set up your development environment
2. **Research Implementation**: Understand the original model architecture
3. **Project Structure Setup**: Set up the necessary file structure
4. **Create Basic Modelfile**: Create a simple Modelfile for testing
5. **Implement Weight Conversion**: Map from original format to GGUF
6. **Open a Draft PR**: Create a draft pull request to establish communication with maintainers
7. **Implement Model Logic**: Create the model architecture and forward pass
8. **Quality Check and Final Steps**: Create a Modelfile, add tests and ensure functionality
9. **Finalize PR and Publish**: Complete the PR and publish to ollama.com
## Implementation Steps in Detail
### 1. Environment Setup
First, clone the Ollama repository and get it running locally. Follow the development setup guide at:
https://github.com/ollama/ollama/blob/main/docs/development.md
### 2. Research Implementation
Get the original model implementation running. This typically involves:
- Cloning the research code repository (usually Python-based)
- Setting up the required environment
- Running inference with sample inputs
- Understanding the model architecture and forward pass
### 3. Project Structure Setup
Create the necessary file structure by referencing previous model implementations. You'll need:
```
convert/
└── convert_your-model.go # Weight conversion logic (PyTorch/SafeTensors to GGML)
model/
└── your-model/
└── model.go # Architecture and forward pass implementation
```
Add your model to the main paths in [model/models/models.go](https://github.com/ollama/ollama/blob/main/model/models/models.go):
```go
package models

import (
    _ "github.com/ollama/ollama/model/models/llama"
    _ "github.com/ollama/ollama/model/models/mllama"
    _ "github.com/ollama/ollama/model/models/your-model" // Add your model here
)
```
### 4. Create a Basic Modelfile
Create a simple Modelfile early in the process to facilitate testing:
```
FROM /path/to/model
TEMPLATE "{{.Prompt}}" # Use a static prompt format for initial testing
```
This allows you to test your implementation with consistent inputs before finalizing the proper prompt template.
### 5. Implement Weight Conversion
- Work on `convert/convert_your-model.go`
- Reference existing conversion implementations
- Conversion involves mapping from PyTorch/SafeTensors naming to GGUF naming as you see fit
- Understand typical GGUF layout and structure:
**Typical GGUF Layout:**
```
GGUF
├── Metadata Section
│ ├── Model Parameters
│ │ ├── General architecture parameters
│ │ │ ├── "{arch}.vocab_size" (e.g., "llama.vocab_size")
│ │ │ ├── "{arch}.context_length" (e.g., "llama.context_length")
│ │ │ ├── "{arch}.embedding_length" (e.g., "llama.embedding_length")
│ │ │ └── "{arch}.block_count" (e.g., "llama.block_count")
│ │ │
│ │ └── Architecture-specific parameters
│ │ ├── "{arch}.attention.head_count" (e.g., "llama.attention.head_count")
│ │ ├── "{arch}.attention.head_count_kv" (e.g., "llama.attention.head_count_kv")
│ │ ├── "{arch}.rope.dimension_count" (e.g., "llama.rope.dimension_count")
│ │ └── "{arch}.attention.layer_norm_rms_epsilon" (e.g., "llama.attention.layer_norm_rms_epsilon")
│ │
│ ├── Tokenizer parameters
│ │ ├── "tokenizer.ggml.model" (e.g., "llama")
│ │ ├── "tokenizer.ggml.tokens" (vocabulary tokens)
│ │ ├── "tokenizer.ggml.bos_id" (beginning of sequence token ID)
│ │ └── "tokenizer.ggml.eos_id" (end of sequence token ID)
│ │
│ └── General metadata
│ └── "general.architecture" (e.g., "llama", "qwen2", "phi")
└── Tensor Data Section
├── Common tensors:
│ ├── "token_embd.weight" (token embedding matrix)
│ ├── "rope_freqs.weight" (RoPE frequency weights)
│ ├── "output_norm.weight" (final layer normalization)
│ └── "output.weight" (output projection)
└── Layer-specific tensors:
├── "blk.{i}.attn_q.weight" (query projection)
├── "blk.{i}.attn_k.weight" (key projection)
├── "blk.{i}.attn_v.weight" (value projection)
├── "blk.{i}.attn_output.weight" (attention output)
├── "blk.{i}.attn_norm.weight" (attention normalization)
├── "blk.{i}.ffn_norm.weight" (feed-forward normalization)
├── "blk.{i}.ffn_up.weight" (FFN up projection)
├── "blk.{i}.ffn_down.weight" (FFN down projection)
└── "blk.{i}.ffn_gate.weight" (FFN gate projection)
```
- Key conversion details include (see the name-mapping sketch after this list):
- Linear weight matrices (sometimes need transposition)
- Layer normalization weights (might need reshaping)
- **Note: In GGML, FFN values are for the MLP (Multi-Layer Perceptron) part of the architecture**
- Test conversion:
```bash
go run . create <my-model> -f /path/to/Modelfile
```
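To make the name mapping concrete, the self-contained sketch below shows the kind of translation a converter performs for a Llama-style checkpoint. `mapTensorName`, the source names, and the replacement table are illustrative assumptions; real converters in `convert/` express this mapping with their own helpers.
```go
package main

import (
    "fmt"
    "regexp"
    "strings"
)

// layerRE extracts the layer index from names like "model.layers.12.<rest>".
var layerRE = regexp.MustCompile(`^model\.layers\.(\d+)\.`)

// mapTensorName is a hypothetical helper translating PyTorch/SafeTensors
// tensor names into the GGUF names shown in the layout above.
func mapTensorName(name string) string {
    if m := layerRE.FindStringSubmatch(name); m != nil {
        name = "blk." + m[1] + "." + strings.TrimPrefix(name, m[0])
    }
    return strings.NewReplacer(
        "model.embed_tokens", "token_embd",
        "self_attn.q_proj", "attn_q",
        "self_attn.k_proj", "attn_k",
        "self_attn.v_proj", "attn_v",
        "self_attn.o_proj", "attn_output",
        "mlp.gate_proj", "ffn_gate",
        "mlp.up_proj", "ffn_up",
        "mlp.down_proj", "ffn_down",
        "input_layernorm", "attn_norm",
        "post_attention_layernorm", "ffn_norm",
        "model.norm", "output_norm",
        "lm_head", "output",
    ).Replace(name)
}

func main() {
    fmt.Println(mapTensorName("model.layers.0.self_attn.q_proj.weight")) // blk.0.attn_q.weight
    fmt.Println(mapTensorName("model.embed_tokens.weight"))              // token_embd.weight
}
```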
### 6. Open a Draft PR
After implementing the initial weight conversion, creating a draft pull request is recommended as it:
- Establishes a communication channel with Ollama maintainers
- Allows for early feedback on your approach
- Makes it easier to track progress and changes
To open a draft PR:
1. Fork the repository
2. Create a new branch for your model implementation
3. Make initial commits with your weight conversion implementation
4. Open a PR in the `ollama/ollama` repository and mark it as draft
5. Include a clear description of the model you're implementing
### 7. Implement Model Logic
- Reference existing model implementations
- Implement `New()` and `Forward()` functions in `model.go`:
**The `New()` function:**
- Creates and initializes your model structure
- Loads configuration parameters (embedding size, attention heads, etc.)
- Sets up the tokenizer with vocabulary and special tokens
- Initializes all model layers and weights
- **Important**: Sets up the KV cache for efficient inference
- Example:
```go
func New(c ml.Config) (model.Model, error) {
    m := &Model{
        // Initialize tokenizer
        BytePairEncoding: model.NewBytePairEncoding(...),

        // Create layer arrays
        Layers: make([]Layer, c.Uint("block_count")),

        // Set model parameters
        Options: &Options{...},
    }

    // Initialize KV cache for efficient inference
    m.Cache = kvcache.NewCausalCache(m.Shift)

    return m, nil
}
```
**The `Forward()` function:**
- **What it does**: Defines the computational graph of your model
- **Important**: The graph is NOT executed immediately - it's built first, then executed later when predictions are needed
- Takes input tokens and converts them to embeddings
- Processes inputs through transformer layers (attention and feed-forward networks)
- Creates the path for data flow through your model's components
- Example:
```go
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
    // Convert inputs to tensors
    inputTensor, _ := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
    positionsTensor, _ := ctx.FromIntSlice(opts.Positions, len(opts.Positions))

    // Initial token embedding
    hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)

    // Process through transformer layers
    for i, layer := range m.Layers {
        m.Cache.SetLayer(i)
        hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
    }

    // Final processing and output
    normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
    logits := m.Output.Forward(ctx, normalizedOutput)

    // Return logits for requested positions
    outputsTensor, _ := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
    return logits.Rows(ctx, outputsTensor), nil
}
```
**Key Components to Implement:**
1. **KV Cache**:
- Improves inference performance for text generation
- How it works: Stores previously computed key and value tensors from self-attention, avoiding redundant computations
- Implementation: Use the `kvcache.NewCausalCache()` for autoregressive models
- Important: Must implement the `Shift()` function to handle rotary position embeddings with the cache (a minimal sketch follows this list)
2. **Self-Attention**:
- Core component that learns contextual relationships between tokens
- Implements query, key, value projections and their interactions
- Must handle positional encoding (usually Rotary Position Embeddings)
- Uses the KV cache to make generation efficient
3. **Normalization Layers**:
- Purpose: Stabilizes training and maintains consistent activation distributions
- Types: RMSNorm, LayerNorm, etc. depending on model architecture
- Implementation: Apply before attention and feed-forward networks
- Example: `normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)`
4. **Activation Functions**:
- Purpose: Introduces non-linearity into the model
- Common types: SILU (Sigmoid Linear Unit), GELU, ReLU
- Found in feed-forward/MLP blocks
- Example:
```go
// SwiGLU activation in MLP
gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
upProjection := mlp.Up.Forward(ctx, hiddenState)
intermediateStates := gateActivation.Mul(ctx, upProjection)
```
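The `Shift()` function required for the KV cache (component 1 above) is typically small. This is a minimal sketch assuming a RoPE helper on `ml.Tensor` and rope parameters stored on your model; check current model implementations for the exact signature rather than relying on this guide.
```go
// Shift re-applies rotary position embeddings to cached keys so they stay
// consistent with their shifted positions. (Sketch only: the RoPE arguments
// and field names are assumptions.)
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
    return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
}
```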
- Run your forward pass:
```bash
# in the root of the ollama directory
go build .
OLLAMA_DEBUG=1 ./ollama serve
OLLAMA_DEBUG=1 ./ollama run <my-model>
```
- Compare output with research implementation
### 8. Quality Check and Final Steps
1. Add comprehensive tests to:
- `model_test.go`
- `convert_test.go`
2. Ensure tests cover:
- Weight conversion
- Model initialization
- Text generation
3. **Create Final Modelfile**
- Replace the static prompt with the proper Go template for your model:
```
FROM <converted-gguf>
TEMPLATE <prompt-template> # Add the proper Go template for your model, including tools if needed
LICENSE <license-info> # Add appropriate license information
# Add additional parameters if needed
```
4. **End-to-end Testing**
- Run your model with your local Ollama build to ensure that it functions as expected (a smoke-test sketch follows this list)
5. Benchmark
- Run performance benchmarks on your model implementation
```bash
# from the root of the Ollama directory, while a server is running locally
go build .
OLLAMA_DEBUG=1 ./ollama serve
go test -bench=. -m <your-model-name> ./...
```
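As a starting point for the end-to-end check above, the sketch below mirrors the backend benchmark in this change set (flag handling, `server.GetModel`, `model.Forward`, `ctx.Compute`). Treat the package name, file location, and sample inputs as assumptions to adapt for your model.
```go
package yourmodel_test

import (
    "flag"
    "testing"

    "github.com/ollama/ollama/ml"
    "github.com/ollama/ollama/model"
    "github.com/ollama/ollama/server"

    _ "github.com/ollama/ollama/model/models/llama" // register your model package instead
)

var modelName = flag.String("m", "", "name of the model to smoke-test")

// TestForwardSmoke loads a converted model and runs a single forward pass.
// It is skipped unless a model is supplied with -m.
func TestForwardSmoke(t *testing.T) {
    if *modelName == "" {
        t.Skip("no -m flag provided")
    }

    sm, err := server.GetModel(*modelName)
    if err != nil {
        t.Fatal(err)
    }

    m, err := model.New(sm.ModelPath)
    if err != nil {
        t.Fatal(err)
    }
    m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)

    ctx := m.Backend().NewContext()
    defer ctx.Close()

    out, err := model.Forward(ctx, m, model.Options{
        Inputs:    []int32{1, 2, 3, 4, 5},
        Positions: []int32{1, 2, 3, 4, 5},
        Sequences: []int{1, 1, 1, 1, 1},
        Outputs:   []int32{4},
    })
    if err != nil {
        t.Fatal(err)
    }

    // Building and computing the graph without an error is the smoke signal;
    // compare decoded outputs against the research implementation by hand.
    ctx.Compute(out)
}
```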
### 9. Finalize PR and Publish to ollama.com
1. **Finalize Pull Request**
- Move PR out of draft state
- Address reviewer feedback
2. **Publish to ollama.com**
- Push to ollama.com:
```bash
ollama create <your-namespace>/<your-model> -f /path/to/Modelfile
ollama push <your-namespace>/<your-model>
```

View File

@@ -167,6 +167,8 @@ var (
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// Ollama is running in a benchmark context, additional timing data will be collected.
Benchmark = Bool("OLLAMA_BENCHMARK")
)
func String(s string) func() string {

View File

@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
    return 10
}

func (c *testContext) Timing() []ml.OpTiming {
    return []ml.OpTiming{}
}
func (c *testContext) Close() {}
type testTensor struct {

View File

@@ -0,0 +1,24 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 18 Feb 2025 14:47:21 -0800
Subject: [PATCH] remove amx
---
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
else ()
ggml_add_cpu_backend_variant_impl("")
endif()

View File

@@ -2,6 +2,7 @@ package ml
import (
    "bytes"
    "cmp"
    "encoding/binary"
    "fmt"
    "os"
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
}
func NewBackend(f *os.File) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
return backend(f)
}
@@ -53,6 +54,30 @@ type Context interface {
    Compute(...Tensor)
    MaxTensors() int
    Close()
    Timing() []OpTiming
}

// OpType is the type of operation performed during a forward pass.
type OpType string

const (
    View       OpType = "View"
    Copy       OpType = "Copy"
    Reshape    OpType = "Reshape"
    Permute    OpType = "Permute"
    Contiguous OpType = "Contiguous"
    Input      OpType = "Input"
    ComputeOp  OpType = "Compute"
    Transpose  OpType = "Transpose"
)

// OpTiming stores the timing information for a single operation.
type OpTiming struct {
    Type OpType
    Operation string
    Duration float64
    Order int
}

type Tensor interface {

View File

@@ -4,6 +4,8 @@ package ggml
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
#endif
}
// Define a fixed-size struct to store timing data
#define MAX_TENSOR_NAME 256
#define MAX_TIMINGS 1000

typedef struct {
    char tensor_name[MAX_TENSOR_NAME];
    double duration_ms;
} timing_entry;

typedef struct {
    timing_entry entries[MAX_TIMINGS];
    int count;
} timing_data;

// Global timing data structure
timing_data g_timings = {0};

double get_time_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}

bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
    static double start_time;
    static char current_tensor[MAX_TENSOR_NAME];

    if (ask) {
        start_time = get_time_ms();
        strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
        current_tensor[MAX_TENSOR_NAME - 1] = '\0';
    } else {
        double end_time = get_time_ms();
        double duration = end_time - start_time;

        if (g_timings.count < MAX_TIMINGS) {
            strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
            g_timings.entries[g_timings.count].duration_ms = duration;
            g_timings.count++;
        }
    }
    return true;
}

void clear_timings() {
    g_timings.count = 0;
}
*/
import "C"
@@ -29,9 +79,11 @@ import (
"io"
"log/slog"
"os"
"strings"
"sync"
"unsafe"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
fs "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
    C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}

// Timing retrieves the collected timing data
func (c *Context) Timing() []ml.OpTiming {
    sequence := make([]ml.OpTiming, C.g_timings.count)

    for i := range int(C.g_timings.count) {
        entry := C.g_timings.entries[i]
        tensorName := C.GoString(&entry.tensor_name[0])

        // Determine operation type and description based on tensor name
        var opType ml.OpType
        var opDesc string
        switch {
        case strings.Contains(tensorName, "(view)"):
            opType, opDesc = ml.View, "Memory view"
        case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
            opType, opDesc = ml.Copy, "Memory copy"
        case strings.Contains(tensorName, "(reshaped)"):
            opType, opDesc = ml.Reshape, "Reshape"
        case strings.Contains(tensorName, "(permuted)"):
            opType, opDesc = ml.Permute, "Permute dimensions"
        case strings.Contains(tensorName, "(cont)"):
            opType, opDesc = ml.Contiguous, "Make contiguous"
        case strings.Contains(tensorName, "(transposed)"):
            opType, opDesc = ml.Transpose, "Transpose"
        case strings.HasPrefix(tensorName, "leaf_"):
            opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
        case strings.HasPrefix(tensorName, "node_"):
            opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
        default:
            opType, opDesc = "Unknown", tensorName
        }

        sequence[i] = ml.OpTiming{
            Type: opType,
            Operation: opDesc,
            Duration: float64(entry.duration_ms),
            Order: i,
        }
    }

    return sequence
}

func (c *Context) Compute(tensors ...ml.Tensor) {
    if envconfig.Benchmark() {
        // Clear previous timings before new computation
        C.clear_timings()
        C.ggml_backend_sched_set_eval_callback(
            c.sched,
            C.ggml_backend_eval_callback(C.debug_callback),
            nil,
        )
    }

    C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
    needSync := true

View File

@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
if (NOT MSVC)
# MSVC doesn't support AMX
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
else ()
ggml_add_cpu_backend_variant_impl("")
endif()