Compare commits: brucemacd/... to brucemacd/...

6 Commits

| SHA1 |
|---|
| 057cc54b66 |
| 1e438b237c |
| d721a02e7d |
| 778603a818 |
| 3c874df46e |
| 5f8c03189e |
@@ -382,6 +382,8 @@ See the [API documentation](./docs/api.md) for all endpoints.

- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)

### Cloud
benchmark/ggml_backend_benchmark_test.go (new file, 86 lines)

@@ -0,0 +1,86 @@
package backend

import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"testing"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/server"

	_ "github.com/ollama/ollama/model/models/llama"
)

var modelName = flag.String("m", "", "Name of the model to benchmark")

func suppressOutput() (cleanup func()) {
	oldStdout, oldStderr := os.Stdout, os.Stderr
	os.Stdout, os.Stderr = nil, nil
	log.SetOutput(io.Discard)

	return func() {
		os.Stdout, os.Stderr = oldStdout, oldStderr
		log.SetOutput(os.Stderr)
	}
}

func setupModel(b *testing.B) model.Model {
	if *modelName == "" {
		b.Fatal("Error: -m flag is required for benchmark tests")
	}

	sm, err := server.GetModel(*modelName)
	if err != nil {
		b.Fatal(err)
	}

	m, err := model.New(sm.ModelPath)
	if err != nil {
		b.Fatal(err)
	}

	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
	return m
}

func BenchmarkGGMLOperations(b *testing.B) {
	// loading the GGML back-end logs to standard out and makes the bench output messy
	cleanup := suppressOutput()
	defer cleanup()

	b.Setenv("OLLAMA_BENCHMARK", "1")
	b.Setenv("OLLAMA_BACKEND", "ggml")

	m := setupModel(b)

	// Sample input data
	inputIDs := []int32{1, 2, 3, 4, 5}
	options := model.Options{
		Inputs:    inputIDs,
		Positions: []int32{1, 2, 3, 4, 5},
		Sequences: []int{1, 1, 1, 1, 1},
		Outputs:   []int32{int32(len(inputIDs) - 1)},
	}

	b.ResetTimer()

	for range b.N {
		ctx := m.Backend().NewContext()
		defer ctx.Close()

		modelOutput, err := model.Forward(ctx, m, options)
		if err != nil {
			b.Fatal(fmt.Errorf("forward pass failed: %v", err))
		}

		ctx.Compute(modelOutput)

		for _, op := range ctx.Timing() {
			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
		}
	}
}
@@ -10,6 +10,7 @@ import (
	"os"
	"strings"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/spf13/cobra"

@@ -490,6 +491,96 @@ func TestPushHandler(t *testing.T) {
	}
}

func TestListHandler(t *testing.T) {
	tests := []struct {
		name           string
		args           []string
		serverResponse []api.ListModelResponse
		expectedError  string
		expectedOutput string
	}{
		{
			name: "list all models",
			args: []string{},
			serverResponse: []api.ListModelResponse{
				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-48 * time.Hour)},
			},
			expectedOutput: "NAME ID SIZE MODIFIED \n" +
				"model1 sha256:abc12 1.0 KB 24 hours ago \n" +
				"model2 sha256:def45 2.0 KB 2 days ago \n",
		},
		{
			name: "filter models by prefix",
			args: []string{"model1"},
			serverResponse: []api.ListModelResponse{
				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-24 * time.Hour)},
			},
			expectedOutput: "NAME ID SIZE MODIFIED \n" +
				"model1 sha256:abc12 1.0 KB 24 hours ago \n",
		},
		{
			name:          "server error",
			args:          []string{},
			expectedError: "server error",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				if r.URL.Path != "/api/tags" || r.Method != http.MethodGet {
					t.Errorf("unexpected request to %s %s", r.Method, r.URL.Path)
					http.Error(w, "not found", http.StatusNotFound)
					return
				}

				if tt.expectedError != "" {
					http.Error(w, tt.expectedError, http.StatusInternalServerError)
					return
				}

				response := api.ListResponse{Models: tt.serverResponse}
				if err := json.NewEncoder(w).Encode(response); err != nil {
					t.Fatal(err)
				}
			}))
			defer mockServer.Close()

			t.Setenv("OLLAMA_HOST", mockServer.URL)

			cmd := &cobra.Command{}
			cmd.SetContext(context.TODO())

			// Capture stdout
			oldStdout := os.Stdout
			r, w, _ := os.Pipe()
			os.Stdout = w

			err := ListHandler(cmd, tt.args)

			// Restore stdout and get output
			w.Close()
			os.Stdout = oldStdout
			output, _ := io.ReadAll(r)

			if tt.expectedError == "" {
				if err != nil {
					t.Errorf("expected no error, got %v", err)
				}
				if got := string(output); got != tt.expectedOutput {
					t.Errorf("expected output:\n%s\ngot:\n%s", tt.expectedOutput, got)
				}
			} else {
				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
				}
			}
		})
	}
}

func TestCreateHandler(t *testing.T) {
	tests := []struct {
		name string
(deleted file, 338 lines)

@@ -1,338 +0,0 @@
# Guide: Implementing Models in Ollama's Go Inference Engine

> **Note**: This guide and the Go inference engine are in early development and will be updated as implementation details evolve.

This guide outlines the process of implementing a new model in Ollama's inference engine. It covers everything from initial setup to publishing your model to ollama.com.

## Architecture Overview

Below is a diagram showing Ollama's inference engine architecture layers and how they interact:
```mermaid
graph TB
    subgraph Models["Model Layer: LLM Implementations"]
        direction TB
        llama["model/models/llama"]
        mllama["model/models/mllama"]
        qwen["model/models/qwen2"]
        etc["...etc"]

        note1[" Each model implements a<br>specific architecture:<br>- Defines model parameters<br>- Implements forward pass"]
    end

    subgraph ML_Ops["Neural Network Operations"]
        direction TB
        nn_ops[" nn/<br>linear.go: Matrix multiplication<br>embedding.go: Token embedding lookups<br>normalization.go: Layer norm operations<br>convolution.go: Convolutional operations "]

        backend[" ml/backend.go<br>Hardware Abstraction Layer:<br>- Defines tensor operations<br>- Manages computation graphs<br>- Handles memory allocation "]

        note2[" Common neural net operations:<br>- Abstracts hardware details<br>- Provides unified API<br>- Manages computation flow "]
    end

    subgraph Hardware["Backend Execution Layer"]
        direction TB
        backend_impl[" The backend package provides:<br>- Unified computation interface<br>- Automatic hardware selection<br>- Optimized kernels<br>- Efficient memory management "]

        subgraph Backends["Backend Implementations"]
            direction LR
            cpu["backend/cpu<br>- Pure Go implementation<br>- Fallback for all platforms"]

            metal["backend/metal<br>- Apple Silicon (M1/M2/M3)<br>- MLX integration<br>- Leverages Apple Neural Engine"]

            onnx["backend/onnx<br>- Cross-platform compatibility<br>- ONNX Runtime integration<br>- Pre-compiled graph execution"]

            ggml["backend/ggml<br>- CPU/GPU quantized compute<br>- Low-precision operations<br>- Memory-efficient inferencing"]
        end
    end

    Models --> |" Makes high-level calls<br>(e.g., self-attention) "| ML_Ops
    ML_Ops --> |" Translates to tensor operations<br>(e.g., matmul, softmax) "| Hardware
    backend_impl --> Backends
```
When implementing a new model, you'll primarily work in the model layer, interfacing with the neural network operations layer.

## Implementation Process Overview

Here's the high-level process for implementing a new model in Ollama:

1. **Environment Setup**: Clone the repository and set up your development environment
2. **Research Implementation**: Understand the original model architecture
3. **Project Structure Setup**: Set up the necessary file structure
4. **Create Basic Modelfile**: Create a simple Modelfile for testing
5. **Implement Weight Conversion**: Map from original format to GGUF
6. **Open a Draft PR**: Create a draft pull request to establish communication with maintainers
7. **Implement Model Logic**: Create the model architecture and forward pass
8. **Quality Check and Final Steps**: Create a Modelfile, add tests, and ensure functionality
9. **Finalize PR and Publish**: Complete the PR and publish to ollama.com

## Implementation Steps in Detail
### 1. Environment Setup

First, clone the Ollama repository and get it running locally. Follow the development setup guide at:
https://github.com/ollama/ollama/blob/main/docs/development.md

### 2. Research Implementation

Get the original model implementation running. This typically involves:
- Cloning the research code repository (usually Python-based)
- Setting up the required environment
- Running inference with sample inputs
- Understanding the model architecture and forward pass
### 3. Project Structure Setup

Create the necessary file structure by referencing previous model implementations. You'll need:

```
convert/
└── convert_your-model.go    # Weight conversion logic (PyTorch/SafeTensors to GGML)
model/
└── your-model/
    └── model.go             # Architecture and forward pass implementation
```

Add your model to the main paths in [model/models/models.go](https://github.com/ollama/ollama/blob/main/model/models/models.go):

```
package models

import (
	_ "github.com/ollama/ollama/model/models/llama"
	_ "github.com/ollama/ollama/model/models/mllama"
	_ "github.com/ollama/ollama/model/models/your-model" // Add your model here
)
```
### 4. Create a Basic Modelfile

Create a simple Modelfile early in the process to facilitate testing:

```
FROM /path/to/model
TEMPLATE "{{.Prompt}}" # Use a static prompt format for initial testing
```

This allows you to test your implementation with consistent inputs before finalizing the proper prompt template.
### 5. Implement Weight Conversion

- Work on `convert/convert_your-model.go`
- Reference existing conversion implementations
- Conversion involves mapping from PyTorch/SafeTensors naming to GGUF naming as you see fit (a hypothetical mapping sketch follows at the end of this step)
- Understand typical GGUF layout and structure:

**Typical GGUF Layout:**
```
GGUF
├── Metadata Section
│   ├── Model Parameters
│   │   ├── General architecture parameters
│   │   │   ├── "{arch}.vocab_size" (e.g., "llama.vocab_size")
│   │   │   ├── "{arch}.context_length" (e.g., "llama.context_length")
│   │   │   ├── "{arch}.embedding_length" (e.g., "llama.embedding_length")
│   │   │   └── "{arch}.block_count" (e.g., "llama.block_count")
│   │   │
│   │   └── Architecture-specific parameters
│   │       ├── "{arch}.attention.head_count" (e.g., "llama.attention.head_count")
│   │       ├── "{arch}.attention.head_count_kv" (e.g., "llama.attention.head_count_kv")
│   │       ├── "{arch}.rope.dimension_count" (e.g., "llama.rope.dimension_count")
│   │       └── "{arch}.attention.layer_norm_rms_epsilon" (e.g., "llama.attention.layer_norm_rms_epsilon")
│   │
│   ├── Tokenizer parameters
│   │   ├── "tokenizer.ggml.model" (e.g., "llama")
│   │   ├── "tokenizer.ggml.tokens" (vocabulary tokens)
│   │   ├── "tokenizer.ggml.bos_id" (beginning of sequence token ID)
│   │   └── "tokenizer.ggml.eos_id" (end of sequence token ID)
│   │
│   └── General metadata
│       └── "general.architecture" (e.g., "llama", "qwen2", "phi")
│
└── Tensor Data Section
    ├── Common tensors:
    │   ├── "token_embd.weight" (token embedding matrix)
    │   ├── "rope_freqs.weight" (RoPE frequency weights)
    │   ├── "output_norm.weight" (final layer normalization)
    │   └── "output.weight" (output projection)
    │
    └── Layer-specific tensors:
        ├── "blk.{i}.attn_q.weight" (query projection)
        ├── "blk.{i}.attn_k.weight" (key projection)
        ├── "blk.{i}.attn_v.weight" (value projection)
        ├── "blk.{i}.attn_output.weight" (attention output)
        ├── "blk.{i}.attn_norm.weight" (attention normalization)
        ├── "blk.{i}.ffn_norm.weight" (feed-forward normalization)
        ├── "blk.{i}.ffn_up.weight" (FFN up projection)
        ├── "blk.{i}.ffn_down.weight" (FFN down projection)
        └── "blk.{i}.ffn_gate.weight" (FFN gate projection)
```

- Key conversion details include:
  - Linear weight matrices (sometimes need transposition)
  - Layer normalization weights (might need reshaping)
  - **Note: In GGML, FFN values are for the MLP (Multi-Layer Perceptron) part of the architecture**

- Test conversion:
```bash
go run . create <my-model> -f /path/to/Modelfile
```
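To make the renaming step concrete, here is a minimal, hypothetical sketch of a name-mapping helper. The source-side patterns and the `convertTensorName` function are illustrative assumptions rather than Ollama's actual `convert` package API; only the GGUF-side names follow the layout shown above, and real converters also handle the transposition and reshaping noted in the bullets.

```go
package convert

import (
	"fmt"
	"regexp"
)

// tensorNameReplacements maps source (PyTorch/SafeTensors-style) tensor name
// patterns to GGUF names. The source patterns are illustrative only.
var tensorNameReplacements = []struct {
	pattern *regexp.Regexp
	gguf    string
}{
	{regexp.MustCompile(`^model\.embed_tokens\.weight$`), "token_embd.weight"},
	{regexp.MustCompile(`^model\.norm\.weight$`), "output_norm.weight"},
	{regexp.MustCompile(`^lm_head\.weight$`), "output.weight"},
	{regexp.MustCompile(`^model\.layers\.(\d+)\.self_attn\.q_proj\.weight$`), "blk.%s.attn_q.weight"},
	{regexp.MustCompile(`^model\.layers\.(\d+)\.self_attn\.k_proj\.weight$`), "blk.%s.attn_k.weight"},
	{regexp.MustCompile(`^model\.layers\.(\d+)\.mlp\.gate_proj\.weight$`), "blk.%s.ffn_gate.weight"},
}

// convertTensorName returns the GGUF name for a source tensor name, or
// ok == false when no mapping is known.
func convertTensorName(name string) (gguf string, ok bool) {
	for _, r := range tensorNameReplacements {
		if m := r.pattern.FindStringSubmatch(name); m != nil {
			if len(m) > 1 {
				return fmt.Sprintf(r.gguf, m[1]), true // substitute the captured layer index
			}
			return r.gguf, true
		}
	}
	return "", false
}
```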
### 6. Open a Draft PR

After implementing the initial weight conversion, creating a draft pull request is recommended as it:
- Establishes a communication channel with Ollama maintainers
- Allows for early feedback on your approach
- Makes it easier to track progress and changes

To open a draft PR:
1. Fork the repository
2. Create a new branch for your model implementation
3. Make initial commits with your weight conversion implementation
4. Open a PR in the `ollama/ollama` repository and mark it as draft
5. Include a clear description of the model you're implementing
### 7. Implement Model Logic

- Reference existing model implementations
- Implement the `New()` and `Forward()` functions in `model.go`:

**The `New()` function:**
- Creates and initializes your model structure
- Loads configuration parameters (embedding size, attention heads, etc.)
- Sets up the tokenizer with vocabulary and special tokens
- Initializes all model layers and weights
- **Important**: Sets up the KV cache for efficient inference
- Example:
```go
func New(c ml.Config) (model.Model, error) {
    m := &Model{
        // Initialize tokenizer
        BytePairEncoding: model.NewBytePairEncoding(...),
        // Create layer arrays
        Layers: make([]Layer, c.Uint("block_count")),
        // Set model parameters
        Options: &Options{...},
    }
    // Initialize KV cache for efficient inference
    m.Cache = kvcache.NewCausalCache(m.Shift)
    return m, nil
}
```

**The `Forward()` function:**
- **What it does**: Defines the computational graph of your model
- **Important**: The graph is NOT executed immediately; it is built first, then executed later when predictions are needed (see the minimal driver sketch after this example)
- Takes input tokens and converts them to embeddings
- Processes inputs through transformer layers (attention and feed-forward networks)
- Creates the path for data flow through your model's components
- Example:
```go
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
    // Convert inputs to tensors
    inputTensor, _ := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
    positionsTensor, _ := ctx.FromIntSlice(opts.Positions, len(opts.Positions))

    // Initial token embedding
    hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)

    // Process through transformer layers
    for i, layer := range m.Layers {
        m.Cache.SetLayer(i)
        hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
    }

    // Final processing and output
    normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
    logits := m.Output.Forward(ctx, normalizedOutput)

    // Return logits for requested positions
    outputsTensor, _ := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
    return logits.Rows(ctx, outputsTensor), nil
}
```
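To make the build-then-execute flow concrete, here is a minimal driver sketch that mirrors the pattern used by the benchmark added in this change; the `generateOnce` name is hypothetical, and `m` and `opts` are assumed to come from your own setup code.

```go
// Minimal driver sketch: Forward only builds the computation graph; nothing
// runs until Compute is called on the backend context.
func generateOnce(m model.Model, opts model.Options) error {
	ctx := m.Backend().NewContext()
	defer ctx.Close()

	logits, err := model.Forward(ctx, m, opts) // builds the graph
	if err != nil {
		return err
	}

	ctx.Compute(logits) // executes the graph on the selected backend
	return nil
}
```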
**Key Components to Implement:**

1. **KV Cache**:
   - Improves inference performance for text generation
   - How it works: Stores previously computed key and value tensors from self-attention, avoiding redundant computations
   - Implementation: Use `kvcache.NewCausalCache()` for autoregressive models
   - Important: Must implement the `Shift()` function to handle rotary position embeddings with the cache (a sketch follows after this list)

2. **Self-Attention**:
   - Core component that learns contextual relationships between tokens
   - Implements query, key, value projections and their interactions
   - Must handle positional encoding (usually Rotary Position Embeddings)
   - Uses the KV cache to make generation efficient

3. **Normalization Layers**:
   - Purpose: Stabilizes training and maintains consistent activation distributions
   - Types: RMSNorm, LayerNorm, etc., depending on model architecture
   - Implementation: Apply before attention and feed-forward networks
   - Example: `normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)`

4. **Activation Functions**:
   - Purpose: Introduces non-linearity into the model
   - Common types: SILU (Sigmoid Linear Unit), GELU, ReLU
   - Found in feed-forward/MLP blocks
   - Example:
   ```go
   // SwiGLU activation in MLP
   gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
   upProjection := mlp.Up.Forward(ctx, hiddenState)
   intermediateStates := gateActivation.Mul(ctx, upProjection)
   ```
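As a companion to the KV cache notes above, here is a hedged sketch of a `Shift()` implementation for a RoPE-based model. It is modeled loosely on the llama implementation; the exact `RoPE` arguments and the rope-related fields on the model are assumptions that will differ per architecture.

```go
// Sketch only: when the causal cache moves cached entries to new positions,
// re-apply rotary position embeddings to the cached keys so they stay
// consistent. ropeFactors, ropeDim, ropeBase, and ropeScale are assumed
// fields on the model, not a fixed API.
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	return key.RoPE(ctx, shift, m.ropeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
}
```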
- Run your forward pass:
```bash
# in the root of the ollama directory
go build .
OLLAMA_DEBUG=1 ./ollama serve
OLLAMA_DEBUG=1 ./ollama run <my-model>
```
- Compare output with research implementation
### 8. Quality Check and Final Steps

1. Add comprehensive tests to (a skeleton is sketched at the end of this step):
   - `model_test.go`
   - `convert_test.go`

2. Ensure tests cover:
   - Weight conversion
   - Model initialization
   - Text generation

3. **Create Final Modelfile**
   - Replace the static prompt with the proper Go template for your model:
   ```
   FROM <converted-gguf>
   TEMPLATE <prompt-template> # Add the proper Go template for your model, including tools if needed
   LICENSE <license-info>     # Add appropriate license information
   # Add additional parameters if needed
   ```

4. **End-to-end Testing**
   - Run your model with your local Ollama build to ensure that it functions as expected

5. **Benchmark**
   - Run performance benchmarks on your model implementation:
   ```bash
   # from the root of the Ollama directory, while a server is running locally
   go build .
   OLLAMA_DEBUG=1 ./ollama serve
   go test -bench=. -m <your-model-name> ./...
   ```
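To illustrate item 1 above, here is a minimal, hypothetical table-driven skeleton for a conversion test. `convertTensorName` is the illustrative helper sketched in step 5, not an existing API; the expected names simply echo the GGUF layout shown earlier.

```go
func TestConvertTensorName(t *testing.T) {
	cases := []struct {
		in   string // source (PyTorch/SafeTensors) tensor name
		want string // expected GGUF name
	}{
		{"model.embed_tokens.weight", "token_embd.weight"},
		{"model.layers.0.self_attn.q_proj.weight", "blk.0.attn_q.weight"},
		{"lm_head.weight", "output.weight"},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			got, ok := convertTensorName(tc.in)
			if !ok {
				t.Fatalf("no mapping for %q", tc.in)
			}
			if got != tc.want {
				t.Errorf("convertTensorName(%q) = %q, want %q", tc.in, got, tc.want)
			}
		})
	}
}
```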
### 9. Finalize PR and Publish to ollama.com

1. **Finalize Pull Request**
   - Move PR out of draft state
   - Address reviewer feedback

2. **Publish to ollama.com**
   - Push to ollama.com:
   ```bash
   ollama create <your-namespace>/<your-model> -f /path/to/Modelfile
   ollama push <your-namespace>/<your-model>
   ```
@@ -167,6 +167,8 @@ var (
	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// Ollama is running in a benchmark context, additional timing data will be collected.
	Benchmark = Bool("OLLAMA_BENCHMARK")
)

func String(s string) func() string {
@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
	return 10
}

func (c *testContext) Timing() []ml.OpTiming {
	return []ml.OpTiming{}
}

func (c *testContext) Close() {}

type testTensor struct {
llama/patches/0018-remove-amx.patch (new file, 24 lines)

@@ -0,0 +1,24 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 18 Feb 2025 14:47:21 -0800
Subject: [PATCH] remove amx

---
 ggml/src/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
         ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
         ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
-        if (NOT MSVC)
-            # MSVC doesn't support AMX
-            ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-        endif()
     else ()
         ggml_add_cpu_backend_variant_impl("")
     endif()
@@ -2,6 +2,7 @@ package ml

import (
	"bytes"
	"cmp"
	"encoding/binary"
	"fmt"
	"os"

@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
}

func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
		return backend(f)
	}

@@ -53,6 +54,30 @@ type Context interface {
	Compute(...Tensor)
	MaxTensors() int
	Close()

	Timing() []OpTiming
}

// OpType is the type of operation performed during a forward pass.
type OpType string

const (
	View       OpType = "View"
	Copy       OpType = "Copy"
	Reshape    OpType = "Reshape"
	Permute    OpType = "Permute"
	Contiguous OpType = "Contiguous"
	Input      OpType = "Input"
	ComputeOp  OpType = "Compute"
	Transpose  OpType = "Transpose"
)

// OpTiming stores the timing information for a single operation.
type OpTiming struct {
	Type      OpType
	Operation string
	Duration  float64
	Order     int
}

type Tensor interface {
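As a usage sketch (not part of this change), a caller could fold the per-operation data returned by the new `Context.Timing()` method into totals per operation type; the helper name below is hypothetical and relies only on the `OpTiming` fields defined above.

```go
// totalsByOpType sums the durations reported by Context.Timing() into a
// total number of milliseconds per operation type.
func totalsByOpType(timings []ml.OpTiming) map[ml.OpType]float64 {
	totals := make(map[ml.OpType]float64)
	for _, op := range timings {
		totals[op.Type] += op.Duration // Duration is reported in milliseconds
	}
	return totals
}
```

The benchmark in `benchmark/ggml_backend_benchmark_test.go` consumes the same data directly via `b.ReportMetric`.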
@@ -4,6 +4,8 @@ package ggml
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
#endif
}

// Define a fixed-size struct to store timing data
#define MAX_TENSOR_NAME 256
#define MAX_TIMINGS 1000

typedef struct {
	char tensor_name[MAX_TENSOR_NAME];
	double duration_ms;
} timing_entry;

typedef struct {
	timing_entry entries[MAX_TIMINGS];
	int count;
} timing_data;

// Global timing data structure
timing_data g_timings = {0};

double get_time_ms() {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}

bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
	static double start_time;
	static char current_tensor[MAX_TENSOR_NAME];

	if (ask) {
		start_time = get_time_ms();
		strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
		current_tensor[MAX_TENSOR_NAME - 1] = '\0';
	} else {
		double end_time = get_time_ms();
		double duration = end_time - start_time;

		if (g_timings.count < MAX_TIMINGS) {
			strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
			g_timings.entries[g_timings.count].duration_ms = duration;
			g_timings.count++;
		}
	}
	return true;
}

void clear_timings() {
	g_timings.count = 0;
}

*/
import "C"

@@ -29,9 +79,11 @@ import (
	"io"
	"log/slog"
	"os"
	"strings"
	"sync"
	"unsafe"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"

@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}

// Timing retrieves the collected timing data
func (c *Context) Timing() []ml.OpTiming {
	sequence := make([]ml.OpTiming, C.g_timings.count)

	for i := range int(C.g_timings.count) {
		entry := C.g_timings.entries[i]
		tensorName := C.GoString(&entry.tensor_name[0])

		// Determine operation type and description based on tensor name
		var opType ml.OpType
		var opDesc string

		switch {
		case strings.Contains(tensorName, "(view)"):
			opType, opDesc = ml.View, "Memory view"
		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
			opType, opDesc = ml.Copy, "Memory copy"
		case strings.Contains(tensorName, "(reshaped)"):
			opType, opDesc = ml.Reshape, "Reshape"
		case strings.Contains(tensorName, "(permuted)"):
			opType, opDesc = ml.Permute, "Permute dimensions"
		case strings.Contains(tensorName, "(cont)"):
			opType, opDesc = ml.Contiguous, "Make contiguous"
		case strings.Contains(tensorName, "(transposed)"):
			opType, opDesc = ml.Transpose, "Transpose"
		case strings.HasPrefix(tensorName, "leaf_"):
			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
		case strings.HasPrefix(tensorName, "node_"):
			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
		default:
			opType, opDesc = "Unknown", tensorName
		}

		sequence[i] = ml.OpTiming{
			Type:      opType,
			Operation: opDesc,
			Duration:  float64(entry.duration_ms),
			Order:     i,
		}
	}

	return sequence
}

func (c *Context) Compute(tensors ...ml.Tensor) {
	if envconfig.Benchmark() {
		// Clear previous timings before new computation
		C.clear_timings()

		C.ggml_backend_sched_set_eval_callback(
			c.sched,
			C.ggml_backend_eval_callback(C.debug_callback),
			nil,
		)
	}

	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)

	needSync := true
ml/backend/ggml/ggml/src/CMakeLists.txt (vendored, 4 deletions)

@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
         ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
         ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
-        if (NOT MSVC)
-            # MSVC doesn't support AMX
-            ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-        endif()
     else ()
         ggml_add_cpu_backend_variant_impl("")
     endif()