Compare commits

..

3 Commits

Author SHA1 Message Date
Michael Yang
69f3dfdedf update tests 2025-08-14 15:04:26 -07:00
Michael Yang
7bd3f0269c drop float16 dependency
goos: darwin
goarch: arm64
pkg: github.com/ollama/ollama/convert/float16
cpu: Apple M3 Max
BenchmarkFloat16/x448/float16-16                     159           7398462 ns/op
BenchmarkFloat16/simple-16                           512           2327098 ns/op
PASS
ok      github.com/ollama/ollama/convert/float16        2.553s
2025-08-14 15:04:26 -07:00
Michael Yang
276c4df770 drop bfloat16 dependency
goos: darwin
goarch: arm64
pkg: github.com/ollama/ollama/convert/bfloat16
cpu: Apple M3 Max
BenchmarkBfloat16/d4l3k/go-bfloat16-16               516           2269453 ns/op
BenchmarkBfloat16/simple-16                          1759            626316 ns/op
PASS
ok      github.com/ollama/ollama/convert/bfloat16        2.502s
2025-08-14 15:04:26 -07:00
50 changed files with 1450 additions and 2784 deletions

View File

@@ -22,7 +22,7 @@
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
@@ -30,14 +30,14 @@
"name": "JetPack 5",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "72;87"
"CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
}
},
{
"name": "JetPack 6",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "87"
"CMAKE_CUDA_ARCHITECTURES": "87-virtual"
}
},
{

View File

@@ -86,8 +86,6 @@ RUN go mod download
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
ARG CGO_CFLAGS
ARG CGO_CXXFLAGS
RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .

View File

@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
### Cloud
@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
### Mobile

View File

@@ -90,10 +90,6 @@ type GenerateRequest struct {
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
// for supported models.
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
type Tools []Tool
@@ -316,19 +308,6 @@ type ChatResponse struct {
Metrics
}
// DebugInfo contains debug information for template rendering
type DebugInfo struct {
RenderedTemplate string `json:"rendered_template"`
ImageCount int `json:"image_count,omitempty"`
}
// DebugTemplateResponse is returned when _debug_render_only is set to true
type DebugTemplateResponse struct {
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
DebugInfo DebugInfo `json:"_debug_info"`
}
type Metrics struct {
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`

View File

@@ -1612,7 +1612,6 @@ func NewCLI() *cobra.Command {
appendEnvDocs(cmd, []envconfig.EnvVar{
envVars["OLLAMA_DEBUG"],
envVars["OLLAMA_HOST"],
envVars["OLLAMA_CONTEXT_LENGTH"],
envVars["OLLAMA_KEEP_ALIVE"],
envVars["OLLAMA_MAX_LOADED_MODELS"],
envVars["OLLAMA_MAX_QUEUE"],

View File

@@ -0,0 +1,21 @@
package bfloat16
import "math"
// FromFloat32s converts a slice of float32 values to a slice of bfloat16 values, represented as uint16s.
func FromFloat32s(f32s []float32) (u16s []uint16) {
u16s = make([]uint16, len(f32s))
for i := range f32s {
u16s[i] = uint16(math.Float32bits(f32s[i]) >> 16)
}
return u16s
}
// Float32s converts a slice of bfloat16 values, represented as uint16s, back to a slice of float32 values.
func Float32s(u16s []uint16) (f32s []float32) {
f32s = make([]float32, len(u16s))
for i := range u16s {
f32s[i] = math.Float32frombits(uint32(u16s[i]) << 16)
}
return f32s
}

View File

@@ -0,0 +1,82 @@
package bfloat16
import (
"math"
"math/rand/v2"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestBfloat16(t *testing.T) {
cases := []struct {
name string
input uint16
want uint32
}{
// Zero cases
{"positive zero", 0x0000, 0x0},
{"negative zero", 0x8000, 0x80000000},
// Normal numbers
{"one", 0x3F80, 0x3F800000},
{"negative one", 0xBF80, 0xBF800000},
{"two", 0x4000, 0x40000000},
{"half", 0x3F00, 0x3F000000},
{"quarter", 0x3E80, 0x3E800000},
{"max finite", 0x7F7F, 0x7F7F0000},
{"min positive normal", 0x0080, 0x00800000},
// Infinity cases
{"positive infinity", 0x7F80, 0x7F800000},
{"negative infinity", 0xFF80, 0xFF800000},
// NaN cases
{"NaN", 0x7FC0, 0x7FC00000},
{"NaN with payload", 0x7FC1, 0x7FC10000},
// Subnormal cases
{"min positive subnormal", 0x0001, 0x00010000},
{"max subnormal", 0x007F, 0x007F0000},
// Powers of 2
{"2^10", 0x4480, 0x44800000},
{"2^-10", 0x3A80, 0x3A800000},
{"2^20", 0x4B80, 0x4B800000},
// Common approximations in BF16
{"pi approximation", 0x4049, 0x40490000},
{"e approximation", 0x402E, 0x402E0000},
{"sqrt(2) approximation", 0x3FB5, 0x3FB50000},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
t.Run("Float32s", func(t *testing.T) {
got := Float32s([]uint16{tt.input})[0]
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
}
})
t.Run("FromFloat32s", func(t *testing.T) {
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
}
})
})
}
}
func BenchmarkBfloat16(b *testing.B) {
f32s := make([]float32, 1_000_000)
for i := range f32s {
f32s[i] = rand.Float32()
}
for b.Loop() {
Float32s(FromFloat32s(f32s))
}
}

View File

@@ -0,0 +1,97 @@
package float16
import (
"math"
)
func FromFloat32s(f32s []float32) (u16s []uint16) {
u16s = make([]uint16, len(f32s))
for i := range f32s {
bits := math.Float32bits(f32s[i])
sign := (bits >> 31) & 0x1
exponent := (bits >> 23) & 0xFF
mantissa := bits & 0x7FFFFF
if exponent == 0xFF {
if mantissa == 0 {
// Infinity
u16s[i] = uint16((sign << 15) | 0x7C00)
} else {
// NaN
u16s[i] = uint16((sign << 15) | 0x7C00 | (mantissa >> 13))
}
} else if exponent == 0 && mantissa == 0 {
// Zero
u16s[i] = uint16(sign << 15)
} else {
// Convert exponent from FP32 bias (127) to FP16 bias (15)
exponent := int(exponent) - 127 + 15
if exponent >= 31 {
// Overflow to infinity
u16s[i] = uint16((sign << 15) | 0x7C00)
} else if exponent <= 0 {
// Underflow - create subnormal or zero
if exponent < -10 {
u16s[i] = uint16(sign << 15) // Zero
} else {
// Subnormal number
mantissa = (mantissa | 0x800000) >> uint(-exponent+1)
u16s[i] = uint16((sign << 15) | (mantissa >> 13))
}
} else {
// Normal number - truncate mantissa from 23 to 10 bits
u16s[i] = uint16((sign << 15) | (uint32(exponent) << 10) | (mantissa >> 13))
}
}
}
return u16s
}
func Float32s(u16s []uint16) (f32s []float32) {
f32s = make([]float32, len(u16s))
for i := range u16s {
sign := (u16s[i] >> 15) & 0x1
exponent := (u16s[i] >> 10) & 0x1F
mantissa := u16s[i] & 0x3FF
var u32 uint32
switch exponent {
case 0:
if mantissa == 0 {
// Zero
u32 = uint32(sign) << 31
} else {
// Subnormal - convert to normal
// Find leading 1 bit
shift := 0
temp := mantissa
for temp&0x400 == 0 {
temp <<= 1
shift++
}
exponent := 127 - 15 + 1 - shift
mantissa := (uint32(temp&0x3FF) << 13)
u32 = (uint32(sign) << 31) | (uint32(exponent) << 23) | mantissa
}
case 0x1F:
if mantissa == 0 {
// Infinity
u32 = (uint32(sign) << 31) | 0x7F800000
} else {
// NaN
u32 = (uint32(sign) << 31) | 0x7F800000 | (uint32(mantissa) << 13)
}
default:
// Normal number
exponent := uint32(exponent) - 15 + 127
mantissa := uint32(mantissa) << 13
u32 = (uint32(sign) << 31) | (exponent << 23) | mantissa
}
f32s[i] = math.Float32frombits(u32)
}
return f32s
}

View File

@@ -0,0 +1,75 @@
package float16
import (
"math"
"math/rand/v2"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestFloat16(t *testing.T) {
cases := []struct {
name string
input uint16
want uint32
}{
// Zero cases
{"positive zero", 0x0000, 0x0},
{"negative zero", 0x8000, 0x80000000},
// Normal numbers
{"one", 0x3C00, 0x3F800000},
{"negative one", 0xBC00, 0xBF800000},
{"two", 0x4000, 0x40000000},
{"half", 0x3800, 0x3F000000},
{"max normal", 0x7BFF, 0x477fe000},
{"min positive normal", 0x0400, 0x38800000},
// Infinity cases
{"positive infinity", 0x7C00, 0x7F800000},
{"negative infinity", 0xFC00, 0xFF800000},
// NaN cases
{"NaN", 0x7C01, 0x7f802000},
{"NaN with payload", 0x7E00, 0x7FC00000},
// Subnormal cases
{"min positive subnormal", 0x0001, 0x33800000},
{"max subnormal", 0x03FF, 0x387fc000},
// Common values
{"pi approximation", 0x4248, 0x40490000},
{"e approximation", 0x416F, 0x402de000},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
t.Run("Float32s", func(t *testing.T) {
got := Float32s([]uint16{tt.input})[0]
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
}
})
t.Run("FromFloat32s", func(t *testing.T) {
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
}
})
})
}
}
func BenchmarkFloat16(b *testing.B) {
f32s := make([]float32, 1_000_000)
for i := range f32s {
f32s[i] = rand.Float32()
}
for b.Loop() {
Float32s(FromFloat32s(f32s))
}
}

View File

@@ -13,8 +13,8 @@ import (
"slices"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/x448/float16"
"github.com/ollama/ollama/convert/bfloat16"
"github.com/ollama/ollama/convert/float16"
)
type safetensorMetadata struct {
@@ -163,18 +163,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
return 0, err
}
f32s = make([]float32, len(u16s))
for i := range u16s {
f32s[i] = float16.Frombits(u16s[i]).Float32()
}
f32s = float16.Float32s(u16s)
case "BF16":
u8s := make([]uint8, st.size)
if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
u16s := make([]uint16, st.size/2)
if err = binary.Read(br, binary.LittleEndian, u16s); err != nil {
return 0, err
}
f32s = bfloat16.DecodeFloat32(u8s)
f32s = bfloat16.Float32s(u16s)
default:
return 0, fmt.Errorf("unknown data type: %s", st.dtype)
}
@@ -190,15 +188,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
case tensorKindFP32:
return 0, binary.Write(w, binary.LittleEndian, f32s)
case tensorKindFP16:
f16s := make([]uint16, len(f32s))
for i := range f32s {
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
}
return 0, binary.Write(w, binary.LittleEndian, f16s)
return 0, binary.Write(w, binary.LittleEndian, float16.FromFloat32s(f32s))
case tensorKindBF16:
u8s := bfloat16.EncodeFloat32(f32s)
return 0, binary.Write(w, binary.LittleEndian, u8s)
return 0, binary.Write(w, binary.LittleEndian, bfloat16.FromFloat32s(f32s))
default:
return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
}

View File

@@ -7,9 +7,9 @@ import (
"path/filepath"
"testing"
"github.com/d4l3k/go-bfloat16"
"github.com/google/go-cmp/cmp"
"github.com/x448/float16"
"github.com/ollama/ollama/convert/bfloat16"
"github.com/ollama/ollama/convert/float16"
)
func TestSafetensors(t *testing.T) {
@@ -21,6 +21,11 @@ func TestSafetensors(t *testing.T) {
}
defer root.Close()
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
cases := []struct {
name,
dtype string
@@ -36,11 +41,6 @@ func TestSafetensors(t *testing.T) {
size: 32 * 4, // 32 floats, each 4 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
t.Fatal(err)
}
@@ -62,11 +62,6 @@ func TestSafetensors(t *testing.T) {
size: 32 * 4, // 32 floats, each 4 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
t.Fatal(err)
}
@@ -84,12 +79,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 floats, each 2 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
u16s := make([]uint16, 32)
for i := range u16s {
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
}
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -106,12 +96,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 floats, each 2 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
u16s := make([]uint16, 32)
for i := range u16s {
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
}
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -132,12 +117,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 brain floats, each 2 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -154,12 +134,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 brain floats, each 2 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},

View File

@@ -97,7 +97,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
return a < b
})
gpuCount := 0
gpuOrdinalID := 0
for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match)
@@ -188,6 +187,10 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
// Keep track of numeric IDs based on valid GPUs
gpuID := gpuCount
gpuCount += 1
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
@@ -266,7 +269,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
if uniqueID != 0 {
ID = fmt.Sprintf("GPU-%016x", uniqueID)
} else {
ID = strconv.Itoa(gpuOrdinalID)
ID = strconv.Itoa(gpuID)
}
gpuInfo := RocmGPUInfo{
@@ -284,40 +287,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
DriverMinor: driverMinor,
},
usedFilepath: usedFile,
index: gpuCount,
index: gpuID,
}
// Keep track of numeric IDs based on valid GPUs
gpuCount += 1
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Ordinal IDs are based on the visible GPUs
gpuOrdinalID += 1
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit {
reason := "unsupported Radeon iGPU detected skipping"
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@@ -330,7 +306,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
}
if int(major) < minVer {
reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
slog.Warn(reason, "gpu", gpuInfo.ID)
slog.Warn(reason, "gpu", gpuID)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@@ -339,8 +315,29 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Final validation is gfx compatibility - load the library if we haven't already loaded it
// even if the user overrides, we still need to validate the library

View File

@@ -75,7 +75,7 @@ for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
import { Ollama } from 'ollama';
const ollama = new Ollama({
host: 'https://ollama.com',
host: 'https://ollama.com'
headers: {
Authorization: "Bearer <api key>"
}

View File

@@ -185,8 +185,6 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
// Enable the new memory estimation logic
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
)
func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

View File

@@ -480,8 +480,6 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsKV := f.KV().HeadCountKVMax()
@@ -752,11 +750,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
return cacheType == "f16"
}
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
}

2
go.mod
View File

@@ -10,13 +10,11 @@ require (
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.12.0
)
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.7.0

4
go.sum
View File

@@ -35,8 +35,6 @@ github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARu
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -197,8 +195,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

View File

@@ -7,7 +7,6 @@ import (
"fmt"
"log/slog"
"math"
"math/rand"
"os"
"strconv"
"sync"
@@ -17,157 +16,245 @@ import (
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
func TestConcurrentGenerate(t *testing.T) {
// Assumes all requests have the same model
req, resp := GenerateRequests()
numParallel := int(envconfig.NumParallel() + 1)
iterLimit := 3
func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: smol,
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "qwen3:0.6b",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
// Note: CPU based inference can crawl so don't give up too quickly
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredict(t *testing.T) {
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if maxVram < 4*format.GibiByte {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) warm the model up with a single initial request
slog.Info("loading", "model", req[0].Model)
err := client.Generate(ctx,
&api.GenerateRequest{Model: req[0].Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
func(response api.GenerateResponse) error { return nil },
)
if err != nil {
t.Fatalf("failed to load model %s: %s", req[0].Model, err)
}
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
r := rand.New(rand.NewSource(0))
wg.Add(numParallel)
for i := range numParallel {
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < iterLimit; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
k := r.Int() % len(req)
slog.Info("Starting", "thread", i, "iter", j)
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
wg.Wait()
}
// Stress the scheduler and attempt to load more models than will fit to cause thrashing
// This test will always load at least 2 models even on CPU based systems
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
s := os.Getenv("OLLAMA_MAX_VRAM")
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
s = "0"
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
smallModels := []string{
"llama3.2:1b",
"qwen3:0.6b",
"gemma:2b",
"deepseek-r1:1.5b",
"starcoder2:3b",
}
mediumModels := []string{
"qwen3:8b",
"llama2",
"deepseek-r1:7b",
"mistral",
"dolphin-mistral",
"gemma:7b",
"codellama:7b",
if maxVram < 2*format.GibiByte {
t.Skip("VRAM less than 2G, skipping model stress tests")
}
var chosenModels []string
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
}
smallModels := []model{
{
name: "llama3.2:1b",
size: 2876 * format.MebiByte,
},
{
name: "qwen3:0.6b",
size: 1600 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * format.MebiByte,
},
{
name: "deepseek-r1:1.5b",
size: 2048 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "qwen3:8b",
size: 6600 * format.MebiByte,
},
{
name: "llama2",
size: 5118 * format.MebiByte,
},
{
name: "deepseek-r1:7b",
size: 5600 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
// default:
// slog.Info("selecting large models")
// chosenModels = largeModels
}
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
req, resp := GenerateRequests()
for i := range req {
if i > len(chosenModels) {
break
}
req[i].Model = chosenModels[i].name
}
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Make sure all the models are pulled before we get started
for _, model := range chosenModels {
require.NoError(t, PullIfMissing(ctx, client, model))
for _, r := range req {
require.NoError(t, PullIfMissing(ctx, client, r.Model))
}
// Determine how many models we can load in parallel before we exceed VRAM
// The intent is to go 1 over what can fit so we force the scheduler to thrash
targetLoadCount := 0
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
for i, model := range chosenModels {
req := &api.GenerateRequest{Model: model}
slog.Info("loading", "model", model)
err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
if err != nil {
t.Fatalf("failed to load model %s: %s", model, err)
}
targetLoadCount++
if i > 0 {
models, err := client.ListRunning(ctx)
if err != nil {
t.Fatalf("failed to list running models: %s", err)
}
if len(models.Models) < targetLoadCount {
loaded := []string{}
for _, m := range models.Models {
loaded = append(loaded, m.Name)
}
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
break
}
}
}
if targetLoadCount == len(chosenModels) {
// TODO consider retrying the medium models
slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
}
r := rand.New(rand.NewSource(0))
var wg sync.WaitGroup
for i := range targetLoadCount {
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
if i > 1 && consumed > maxVram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {
defer wg.Done()
reqs, resps := GenerateRequests()
for j := 0; j < 3; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
k := r.Int() % len(reqs)
reqs[k].Model = chosenModels[i]
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Prompt)
DoGenerate(ctx, t, client, reqs[k], resps[k],
120*time.Second, // Be extra patient for the model to load initially
10*time.Second, // Once results start streaming, fail if they stall
)
slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
}
}(i)
}

View File

@@ -4,8 +4,6 @@ package integration
import (
"context"
"log/slog"
"sync"
"testing"
"time"
@@ -65,51 +63,3 @@ func TestContextExhaustion(t *testing.T) {
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
}
// Send multiple requests with prior context and ensure the response is coherant and expected
func TestGenerateWithHistory(t *testing.T) {
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
req, resp := GenerateRequests()
numParallel := 2
iterLimit := 2
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) warm the model up with a single initial request
slog.Info("loading", "model", modelOverride)
err := client.Generate(ctx,
&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
func(response api.GenerateResponse) error { return nil },
)
if err != nil {
t.Fatalf("failed to load model %s: %s", modelOverride, err)
}
var wg sync.WaitGroup
wg.Add(numParallel)
for i := range numParallel {
go func(i int) {
defer wg.Done()
k := i % len(req)
req[k].Model = modelOverride
for j := 0; j < iterLimit; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
slog.Info("Starting", "thread", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
c := DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
req[k].Context = c
req[k].Prompt = "tell me more!"
}
}(i)
}
wg.Wait()
}

File diff suppressed because one or more lines are too long

View File

@@ -472,19 +472,15 @@ func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRe
DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
}
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) []int {
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
stallTimer := time.NewTimer(initialTimeout)
var buf bytes.Buffer
var context []int
fn := func(response api.GenerateResponse) error {
// fmt.Print(".")
buf.Write([]byte(response.Response))
if !stallTimer.Reset(streamTimeout) {
return errors.New("stall was detected while streaming response, aborting")
}
if len(response.Context) > 0 {
context = response.Context
}
return nil
}
@@ -507,7 +503,7 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
case <-done:
if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
return context
return
}
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
// Verify the response contains the expected data
@@ -524,7 +520,6 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
}
return context
}
// Generate a set of requests
@@ -533,35 +528,55 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
return []api.GenerateRequest{
{
Model: smol,
Prompt: "why is the ocean blue? Be brief but factual in your reply",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "why is the color of dirt brown? Be brief but factual in your reply",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the origin of independence day? Be brief but factual in your reply",
Prompt: "what is the origin of independence day?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the composition of air? Be brief but factual in your reply",
Prompt: "what is the composition of air?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
},
},
[][]string{
{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
{"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states"},
{"sunlight", "scattering", "interact"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
{"england", "english", "massachusetts", "pilgrims", "british"},
{"fourth", "july", "declaration", "independence"},
{"nitrogen", "oxygen", "carbon", "dioxide"},
}

View File

@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const bool output_all = false;
// when computing embeddings, all tokens are output
const bool output_all = cparams.embeddings;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -62,22 +62,6 @@ func BackendInit() {
C.llama_backend_init()
}
func EnumerateGPUs() []string {
var ids []string
for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i)
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, C.GoString(props.id))
}
}
return ids
}
func GetModelArch(modelPath string) (string, error) {
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))

View File

@@ -0,0 +1,32 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3040b2aa..f1e9c180 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@@ -13,7 +13,7 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 57eae461..c7f9dc3a 100644
index 57eae461..9db0c8b5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

View File

@@ -1,23 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all
---
src/llama-context.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -4,7 +4,7 @@ import (
"fmt"
"log/slog"
"os"
"sort"
"strconv"
"strings"
"github.com/ollama/ollama/api"
@@ -14,79 +14,13 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
return gpuSubset
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
@@ -115,7 +49,7 @@ type MemoryEstimate struct {
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit []int
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
@@ -137,7 +71,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
@@ -178,9 +112,13 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
}
layers := f.Tensors().GroupLayers()
@@ -246,7 +184,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
tensorSplit := make([]int, len(gpus))
layerCounts := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
@@ -310,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize
tensorSplit[g.i]++
layerCounts[g.i]++
layerCount++
break
} else {
@@ -335,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
tensorSplit[g.i]++
layerCounts[g.i]++
layerCount++
break
}
@@ -350,7 +288,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if tensorSplit[i] <= 0 {
if layerCounts[i] <= 0 {
continue
}
if fullyLoaded {
@@ -372,6 +310,14 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))

View File

@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 int
expect0, expect1 uint64
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b

File diff suppressed because it is too large Load Diff

View File

@@ -8,178 +8,9 @@ import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore"
)
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
library string
free int
}
tests := []struct {
name string
gpus []gpu
layers []int
numGPU int
requireFull bool
expected ml.GPULayersList
expectedErr error
}{
{
name: "No GPU",
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{},
},
{
name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
expectedErr: ErrLoadRequiredFull,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i)
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
s := &ollamaServer{
llmServer: llmServer{
totalLayers: uint64(len(tt.layers)),
options: api.Options{
Runner: api.Runner{
NumGPU: tt.numGPU,
},
},
},
}
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
if err != tt.expectedErr {
t.Fatalf("fitGPU returned error: %v", err)
}
if gpuLayers.Hash() != tt.expected.Hash() {
t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
}
})
}
}
func TestLLMServerCompletionFormat(t *testing.T) {
// This test was written to fix an already deployed issue. It is a bit
// of a mess, and but it's good enough, until we can refactoring the

View File

@@ -5,14 +5,12 @@ import (
"context"
"encoding/binary"
"fmt"
"hash/maphash"
"log/slog"
"math"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs"
)
@@ -60,89 +58,19 @@ type CacheConfig struct {
MaskBatchPadding int
}
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indicies to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
// false, this is only being used for discovering the required amount of
// memory and cannot load the model for running.
AllocMemory bool
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
// GPULayers is the set of layers to offload to GPUs
GPULayers GPULayersList
// MainGPU is the index of the primary GPU to use
MainGPU int
// NumGPULayers is the number of layers to offload to GPUs
NumGPULayers int
// TensorSplit is the fraction of the model to offload to each GPU
TensorSplit []float32
// FlashAttention indicates that we should use a fused flash attention kernel
FlashAttention bool
@@ -213,28 +141,6 @@ type DeviceMemory struct {
Graph Memory
}
// Allocated returns the total size of the memory that has been successfully
// allocated on this device
func (m DeviceMemory) Allocated() uint64 {
var mem uint64
for _, w := range m.Weights {
if w.Status == Allocated {
mem += w.Size
}
}
for _, c := range m.Cache {
if c.Status == Allocated {
mem += c.Size
}
}
if m.Graph.Status == Allocated {
mem += m.Graph.Size
}
return mem
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
@@ -291,58 +197,6 @@ func (m BackendMemory) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}
func sumMemory(mem []Memory) uint64 {
var sum uint64
for _, m := range mem {
sum += m.Size
}
return sum
}
// Log prints a high level summary of the memory (allocated or not)
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {

View File

@@ -10,7 +10,6 @@ import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
@@ -63,21 +62,12 @@ var initDevices = sync.OnceFunc(func() {
}
})
type layerDevice struct {
d C.ggml_backend_dev_t
bt C.ggml_backend_buffer_type_t
}
type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
allocMemory bool
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
@@ -88,14 +78,11 @@ type Backend struct {
tensors map[string]*C.struct_ggml_tensor
// input is the backend buffer type used for inputs
// input is the backend used for inputs
input C.ggml_backend_buffer_type_t
// output is the backend device used for outputs
output C.ggml_backend_dev_t
// layers is the backend used for repeating layers
layers map[int]layerDevice
layers map[int]C.ggml_backend_buffer_type_t
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
@@ -112,8 +99,6 @@ type Backend struct {
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
@@ -126,17 +111,15 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return nil, err
}
once.Do(func() {
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
})
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
initDevices()
@@ -156,10 +139,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
bt := C.ggml_backend_dev_buffer_type(d)
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
}
}
@@ -180,8 +160,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
d: d,
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
})
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
@@ -191,25 +169,56 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
useDefaultSplit := true
for _, s := range params.TensorSplit {
if s != 0 {
useDefaultSplit = false
break
}
}
// calculate splits
splits := make([]float32, len(gpus))
if useDefaultSplit {
// default: split on free memory
for i := range splits {
var free, total C.size_t
C.ggml_backend_dev_memory(gpus[i], &free, &total)
splits[i] = float32(free)
}
} else {
splits = params.TensorSplit
}
var sum float32
// cumulative sum of all splits
for i := range splits {
sum += splits[i]
splits[i] = sum
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
// inputs always use cpu
input := cpuDeviceBufferType
assignLayer := func(layer int) deviceBufferType {
for _, p := range params.GPULayers {
for _, l := range p.Layers {
if l == layer {
for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].ID == p.ID {
return gpuDeviceBufferTypes[i]
}
}
return cpuDeviceBufferType
}
}
// define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
assignLayer := func(i int) deviceBufferType {
if i < gpuRangeStart || i >= gpuRangeStop {
return cpuDeviceBufferType
}
return cpuDeviceBufferType
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
if index < 0 || index >= len(gpuDeviceBufferTypes) {
return cpuDeviceBufferType
}
return gpuDeviceBufferTypes[index]
}
// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
@@ -275,9 +284,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
if params.AllocMemory {
requiredMemory.InputWeights.Status = ml.Allocated
}
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
@@ -348,14 +355,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
if params.AllocMemory {
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
@@ -376,9 +381,28 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bbs[c] = b
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for _, layer := range layers {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(output.d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
for bs := range maps.Values(bbs) {
slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
}
// map tensor names to tensors for easy lookup later
@@ -399,13 +423,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
b := backends[d]
bt := C.ggml_backend_get_default_buffer_type(b)
// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
if !slices.Contains(cpuDeviceBufferType.bts, bt) {
if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
continue
}
}
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
@@ -420,7 +437,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
@@ -436,14 +452,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
output: output.d,
layers: func() map[int]layerDevice {
m := make(map[int]layerDevice)
layers: func() map[int]C.ggml_backend_buffer_type_t {
m := make(map[int]C.ggml_backend_buffer_type_t)
for i, layer := range layers {
m[i] = layerDevice{
d: layer.d,
bt: deviceBufferTypes[layer.d],
}
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
@@ -472,30 +484,6 @@ func (b *Backend) Close() {
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
if !b.allocMemory {
return errors.New("cannot load model without memory allocation")
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for layer := range maps.Values(b.layers) {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(b.output) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
@@ -742,11 +730,11 @@ func (c *Context) Input() ml.Context {
}
func (c *Context) Layer(i int) ml.Context {
if layer, ok := c.b.layers[i]; ok {
if buft, ok := c.b.layers[i]; ok {
return &Context{
b: c.b,
ctx: c.ctx,
buft: layer.bt,
buft: buft,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
@@ -804,16 +792,14 @@ func (c *Context) Reserve() {
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if c.b.allocMemory {
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size)))
}
if !reserved {
@@ -882,12 +868,10 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size)
if c.b.allocMemory {
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}
@@ -906,9 +890,7 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t := c.newTensor(dtype, shape)
if c.b.allocMemory {
C.ggml_set_zero(t.(*Tensor).t)
}
C.ggml_set_zero(t.(*Tensor).t)
return t
}
@@ -933,7 +915,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeF32, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@@ -945,7 +927,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeI32, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@@ -1568,7 +1550,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
// Unchecked to handle quantized types
t := c.newTensor(dtype, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}

View File

@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
// Avoid mixed hip+cuda configurations
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@@ -1,5 +1,5 @@
package arm
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include -DHWCAP2_SVE2="2"
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include
import "C"

View File

@@ -12,6 +12,7 @@ import (
"net/http"
"os"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
@@ -215,12 +216,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
}
type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
@@ -728,12 +723,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
}
}
// loadModel allocates memory based on the given parameters and loads the weights. The
// memory allocated is worst case for text models but not for vision.
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
lpath []string,
lpath multiLPath,
ppath string,
kvSize int,
kvCacheType string,
@@ -753,10 +757,12 @@ func (s *Server) loadModel(
panic(err)
}
for _, path := range lpath {
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
if err != nil {
panic(err)
if lpath.String() != "" {
for _, path := range lpath {
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
if err != nil {
panic(err)
}
}
}
@@ -777,81 +783,26 @@ func (s *Server) loadModel(
s.ready.Done()
}
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
switch req.Operation {
// LoadOperationFit and LoadOperationAlloc have no meaning here - just return a successful response
case llm.LoadOperationCommit:
s.batchSize = req.BatchSize
s.parallel = req.Parallel
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
gpuIDs := llama.EnumerateGPUs()
tensorSplit := make([]float32, len(gpuIDs))
numGPU := 0
for i := range gpuIDs {
for _, layers := range req.GPULayers {
if gpuIDs[i] == layers.ID {
tensorSplit[i] = float32(len(layers.Layers))
numGPU += len(layers.Layers)
}
}
}
params := llama.ModelParams{
NumGpuLayers: numGPU,
MainGpu: req.MainGPU,
UseMmap: req.UseMmap && len(req.LoraPath) == 0,
TensorSplit: tensorSplit,
Progress: func(progress float32) {
s.progress = progress
},
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
resp := llm.LoadResponse{Success: true}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
ppath := fs.String("mmproj", "", "Path to projector binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGpu := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
@@ -866,11 +817,35 @@ func Execute(args []string) error {
llama.BackendInit()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
batchSize: *batchSize,
parallel: *parallel,
seqs: make([]*Sequence, *parallel),
seqsSem: semaphore.NewWeighted(int64(*parallel)),
status: llm.ServerStatusLoadingModel,
}
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
}
params := llama.ModelParams{
NumGpuLayers: *nGpuLayers,
MainGpu: *mainGpu,
UseMmap: !*noMmap && lpaths.String() == "",
TensorSplit: tensorSplitFloats,
Progress: func(progress float32) {
server.progress = progress
},
}
server.ready.Add(1)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
@@ -888,7 +863,6 @@ func Execute(args []string) error {
defer listener.Close()
mux := http.NewServeMux()
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health)

View File

@@ -14,7 +14,6 @@ import (
"net"
"net/http"
"os"
"reflect"
"regexp"
"runtime"
"strconv"
@@ -260,16 +259,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
}
type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// lastLoad is the load request from the previous load attempt. Used to
// detect if we can reuse an existing memory allocation.
lastLoad llm.LoadRequest
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
@@ -731,6 +720,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
}
}
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()
@@ -828,28 +828,15 @@ func (s *Server) reserveWorstCaseGraph() error {
return nil
}
// allocModel pre-allocates the maximum needed memory for a model
// based on the given parameters
func (s *Server) allocModel(
func (s *Server) initModel(
mpath string,
params ml.BackendParams,
loraPath []string,
lpath multiLPath,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool,
) (panicErr error) {
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
if err, ok := r.(error); ok {
panicErr = err
} else {
panic(r)
}
}
}()
) error {
var err error
s.model, err = model.New(mpath, params)
if err != nil {
@@ -857,7 +844,7 @@ func (s *Server) allocModel(
}
// TODO(jessegross): LoRA loading
if len(loraPath) > 0 {
if lpath.String() != "" {
return errors.New("loras are not yet implemented")
}
@@ -878,122 +865,63 @@ func (s *Server) allocModel(
return s.reserveWorstCaseGraph()
}
// closeModel frees all memory associated with a model
func (s *Server) closeModel() {
s.cache.Close()
s.cache = nil
if s.model != nil {
s.model.Backend().Close()
s.model = nil
}
}
func (s *Server) load(
ctx context.Context,
mpath string,
params ml.BackendParams,
lpath multiLPath,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool,
) {
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
if err != nil {
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
// We can't yet handle this but in the future we will
s.cache.Close()
if s.model != nil {
s.model.Backend().Close()
}
}
// loadModel loads the weights for a model. The memory must already
// have been allocated with allocModel
func (s *Server) loadModel() {
err := s.model.Backend().Load(context.TODO(),
panic(err)
}
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
err = s.model.Backend().Load(ctx,
func(progress float32) {
s.progress = progress
})
if err != nil {
panic(fmt.Errorf("failed to load model: %v", err))
panic(err)
}
s.status = llm.ServerStatusReady
s.ready.Done()
}
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
if req.Operation == llm.LoadOperationClose {
s.closeModel()
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
s.lastLoad.Operation = req.Operation
loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
s.lastLoad = req
if loadModel {
s.closeModel()
params := ml.BackendParams{
AllocMemory: req.Operation != llm.LoadOperationFit,
NumThreads: req.NumThreads,
GPULayers: req.GPULayers,
FlashAttention: req.FlashAttention,
}
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
return
}
}
mem := s.model.Backend().BackendMemory()
switch req.Operation {
case llm.LoadOperationFit:
// LoadOperationFit can't be used for anything else, so just close it
s.closeModel()
// LoadOperationAlloc should stay open for future operations
case llm.LoadOperationCommit:
s.status = llm.ServerStatusLoadingModel
go s.loadModel()
}
resp := llm.LoadResponse{Success: true, Memory: mem}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGPU := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
@@ -1005,17 +933,39 @@ func Execute(args []string) error {
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
slog.Info("starting ollama engine")
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
batchSize: *batchSize,
status: llm.ServerStatusLoadingModel,
}
server.cond = sync.NewCond(&server.mu)
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// TODO(jessegross): Parameters that need to be implemented:
// no-mmap
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
}
params := ml.BackendParams{
NumThreads: *threads,
NumGPULayers: *numGPULayers,
MainGPU: *mainGPU,
TensorSplit: tensorSplitFloats,
FlashAttention: *flashAttention,
}
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)
@@ -1028,7 +978,6 @@ func Execute(args []string) error {
mux := http.NewServeMux()
// TODO: support embeddings
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
})

View File

@@ -2,7 +2,6 @@ package server
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
@@ -276,9 +275,8 @@ const (
// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
state harmonyMessageState
harmonyParser *HarmonyParser
functionNameMap *FunctionNameMap
state harmonyMessageState
harmonyParser *HarmonyParser
}
// NewHarmonyMessageHandler creates a new message handler
@@ -290,7 +288,6 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
functionNameMap: NewFunctionNameMap(),
}
}
@@ -381,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
func (a *HarmonyToolCallAccumulator) Content() string {
return a.acc.String()
}
// FunctionNameMap maps a user-specified function name to a valid function
// name for harmony (which look like TypeScript identifiers). This is needed to
// transform user-specified function names, which might contain characters that
// are not allowed in TypeScript identifiers
type FunctionNameMap struct {
userToHarmony map[string]string
harmonyToUser map[string]string
}
func NewFunctionNameMap() *FunctionNameMap {
return &FunctionNameMap{
userToHarmony: make(map[string]string),
harmonyToUser: make(map[string]string),
}
}
func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
harmonyFunctionName := m.deriveName(userFunctionName)
m.userToHarmony[userFunctionName] = harmonyFunctionName
m.harmonyToUser[harmonyFunctionName] = userFunctionName
return harmonyFunctionName
}
// OriginalFromConverted looks up the reverse-mapping of a previously-converted
// user->harmony function name. To unmap reliably, the mapping must exist, as
// the conversion process is not reversible without the appropriate state
func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
return userFunctionName
}
slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
// fallback to the original function name if we can't find a mapping
return harmonyFunctionName
}
// convertToValidChars converts a user-specified function name to a valid
// TypeScript identifier.
//
// Limitations:
//
// - This doesn't restrict reserved TypeScript keywords.
// - We don't perform a real ID_Start/ID_Continue check, and instead use the more
// restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
// identifiers these models were trained on, so in the end we might want to
// convert unicode-heavy identifiers to their closest ASCII equivalents.
func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
mapper := func(r rune) rune {
// first, replace certain characters with underscores
if r == ' ' || r == '-' || r == '.' {
return '_'
}
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
return r
}
// finally, remove any other characters
return -1
}
candidate := strings.Map(mapper, userFunctionName)
// set a default name if we end up with nothing left
if candidate == "" {
return "unnamed"
}
// if the candidate starts with a number, prepend an underscore to make it a
// valid identifier
if unicode.IsDigit(rune(candidate[0])) {
candidate = "_" + candidate
}
return candidate
}
func (m *FunctionNameMap) deriveName(userFunctionName string) string {
originalCandidate := m.convertToValidChars(userFunctionName)
candidate := originalCandidate
// Check for dupes, and if so, add a number to the end.
// We start at 2 because if we have dupes and the first is never renamed, it
// makes sense for them to be named, say, `f`, `f_2`, `f_3`
count := 2
for {
if _, exists := m.harmonyToUser[candidate]; !exists {
break
}
candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
count++
}
return candidate
}

View File

@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
})
}
}
// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
// handle any saving (and therefore no dupe handling)
func TestFunctionConvertToValidChars(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
{name: "leading number", in: "123", want: "_123"},
{name: "$ allowed", in: "$", want: "$"},
// show that we allow weird unicode letter characters, though we might want
// to convert them to their closest ASCII equivalents in the future
{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
}
for i, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := NewFunctionNameMap()
got := parser.convertToValidChars(tt.in)
if got != tt.want {
t.Errorf("case %d: got %q, want %q", i, got, tt.want)
}
})
}
}
func TestFunctionConvertAndAdd(t *testing.T) {
// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
tests := []struct {
name string
in []string
want []string
}{
{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
}
for i, tt := range tests {
parser := NewFunctionNameMap()
t.Run(tt.name, func(t *testing.T) {
for j, in := range tt.in {
got := parser.ConvertAndAdd(in)
want := tt.want[j]
if got != want {
t.Errorf("case %d: got %q, want %q", i, got, want)
}
// check that the maps are correct
if parser.userToHarmony[in] != want {
t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
}
if parser.harmonyToUser[want] != in {
t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
}
}
})
}
}

View File

@@ -314,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt = b.String()
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1490,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
mr := api.ProcessModelResponse{
Model: model.ShortName,
Name: model.ShortName,
Size: int64(v.totalSize),
SizeVRAM: int64(v.vramSize),
Size: int64(v.estimatedTotal),
SizeVRAM: int64(v.estimatedVRAM),
Digest: model.Digest,
Details: modelDetails,
ExpiresAt: v.expiresAt,
}
if v.Options != nil {
mr.ContextLength = v.Options.NumCtx
mr.ContextLength = v.Options.NumCtx / v.numParallel
}
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just
@@ -1603,12 +1590,24 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
useHarmony := shouldUseHarmony(*m)
processedTools := req.Tools
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
var lastMessage *api.Message
@@ -1617,40 +1616,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var thinkingState *thinking.Parser
@@ -1705,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())

View File

@@ -1,413 +0,0 @@
package server
import (
"bytes"
"encoding/json"
"net/http"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.GenerateRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "debug render only enabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Hello, world!",
},
{
name: "debug render only disabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "debug render only with system prompt",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "User question",
System: "You are a helpful assistant",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "User question",
},
{
name: "debug render only with template",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Template: "PROMPT: {{ .Prompt }}",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "PROMPT: Hello",
},
{
name: "debug render only with images",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Describe this image",
Images: []api.ImageData{[]byte("fake-image-data")},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[img-0]\n\nDescribe this image",
expectNumImages: 1,
},
{
name: "debug render only with raw mode",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Raw prompt text",
Raw: true,
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Raw prompt text",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}
func TestChatDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.ChatRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "chat debug render only enabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
},
{
name: "chat debug render only disabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "chat debug with assistant message",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "How are you?"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
},
{
name: "chat debug with images",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's in this image?",
Images: []api.ImageData{[]byte("fake-image-data")},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: [img-0]What's in this image?\n",
expectNumImages: 1,
},
{
name: "chat debug with tools",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Get the weather"},
},
Tools: api.Tools{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get weather information",
},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}

View File

@@ -77,13 +77,12 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@@ -621,13 +620,12 @@ func TestGenerate(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}

View File

@@ -277,11 +277,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@@ -428,11 +427,10 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@@ -610,11 +608,10 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}

View File

@@ -28,6 +28,7 @@ type LlmRequest struct {
ctx context.Context //nolint:containedctx
model *Model
opts api.Options
origNumCtx int // Track the initial ctx request
sessionDuration *api.Duration
successCh chan *runnerRef
errCh chan error
@@ -40,17 +41,10 @@ type Scheduler struct {
expiredCh chan *runnerRef
unloadedCh chan any
// loadedMu protects loaded and activeLoading
loaded map[string]*runnerRef
loadedMu sync.Mutex
// activeLoading is the model that we are currently working on loading,
// including by evicting one or more other models. We can only load
// one model at a time but new requests to models that already loaded can
// happen in parallel
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func() discover.GpuInfoList
getCpuFn func() discover.GpuInfoList
@@ -62,6 +56,9 @@ type Scheduler struct {
// on a large GPU can cause stalling
var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
var defaultParallel = 1
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
func InitScheduler(ctx context.Context) *Scheduler {
@@ -82,36 +79,24 @@ func InitScheduler(ctx context.Context) *Scheduler {
}
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
if m.CheckCapabilities(model.CapabilityVision) == nil {
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
req := &LlmRequest{
ctx: c,
model: m,
model: model,
opts: opts,
sessionDuration: sessionDuration,
successCh: make(chan *runnerRef, 1),
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}
s.loadedMu.Lock()
runner := s.loaded[req.model.ModelPath]
s.loadedMu.Unlock()
if runner != nil && !runner.needsReload(c, req) {
req.useLoadedRunner(runner, s.finishedReqCh)
} else {
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
}
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
}
return req.successCh, req.errCh
}
@@ -137,11 +122,21 @@ func (s *Scheduler) processPending(ctx context.Context) {
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
pending.schedAttempts++
if pending.origNumCtx == 0 {
pending.origNumCtx = pending.opts.NumCtx
}
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
numParallel := int(envconfig.NumParallel())
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
for {
var runnerToExpire *runnerRef
@@ -200,26 +195,84 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
// Embedding models should always be loaded with parallel=1
if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
if loadedCount == 0 {
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 {
numParallel = defaultParallel
}
pending.opts.NumCtx = pending.origNumCtx * numParallel
if loadedCount == 0 {
slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
if runnerToExpire == nil {
slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
s.loadFn(pending, ggml, gpus, false)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
break
}
// More than one loaded model, so we have to see if the
// new one fits
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the
// new one fits
//
// We want to avoid loading on any GPUs that have other
// models still loading on them to avoid potential races
// with VRAM consumption ramping up during load
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
needEvict := s.loadFn(pending, ggml, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")
break
// Update free memory from currently loaded models
s.updateFreeSpace(availGpus)
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
break
}
// We couldn't find a set of GPUs to fully load the new
// model. If no other models are loading (both GPU lists
// are the same) then we need to unload another model to
// make room
if len(availGpus) < len(gpus) {
// There are other requests pending, and this one
// needs more time, so put it on the back of the
// queue so that we might satisfy other pending
// requests that aren't blocked
go func() {
// Process in a go routine to avoid deadlocking
// the scheduler if our queue is full
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
}
runnerToExpire = s.findRunnerToUnload()
}
runnerToExpire = s.findRunnerToUnload()
}
if runnerToExpire == nil {
@@ -240,6 +293,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
runnerToExpire.refMu.Unlock()
// Wait for the unload to happen
// Note: at this point we're queueing up all incoming requests, even if they were for
// a different model that's loaded and not scheduled to be removed.
slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
select {
case <-ctx.Done():
@@ -379,72 +434,26 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
numParallel := int(envconfig.NumParallel())
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
if numParallel < 1 {
numParallel = 1
}
// Embedding models should always be loaded with parallel=1
if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
s.loadedMu.Lock()
llama := s.activeLoading
if llama == nil {
var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
s.loadedMu.Unlock()
return false
}
s.activeLoading = llama
} else {
if s.activeLoading.ModelPath() != req.model.ModelPath {
panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
}
}
s.loadedMu.Unlock()
err := llama.Load(req.ctx, gpus, requireFull)
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
return true
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
}
slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
s.activeLoading.Close()
s.activeLoading = nil
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
return false
return
}
runner := &runnerRef{
model: req.model,
modelPath: req.model.ModelPath,
@@ -452,8 +461,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts,
sessionDuration: sessionDuration,
gpus: gpus,
vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(),
estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(),
loading: true,
pid: llama.Pid(),
}
@@ -468,7 +477,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
oldRunner.unload()
oldRunner.refMu.Unlock()
}
s.activeLoading = nil
s.loaded[req.model.ModelPath] = runner
slog.Info("loaded runners", "count", len(s.loaded))
s.loadedMu.Unlock()
@@ -495,8 +503,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
}()
req.successCh <- runner
}()
return false
}
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
@@ -515,7 +521,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
r.refMu.Lock()
if r.llama != nil {
for _, gpu := range allGpus {
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.VRAMByGPU(gpu.ID)
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
}
} else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -542,17 +548,41 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
}
// While models are loading the VRAM consumption numbers will be indeterminate, so we have
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
// This routine returns the set of GPUs that do not have an active loading model.
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
ret := append(discover.GpuInfoList{}, allGpus...)
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
for _, runner := range s.loaded {
if runner.loading {
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
for _, busyGPU := range runner.gpus {
for i := range ret {
if ret[i].ID == busyGPU.ID {
ret = append(ret[:i], ret[i+1:]...)
break
}
}
}
}
}
return ret
}
// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
refCount uint // prevent unloading if > 0
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
vramSize uint64
totalSize uint64
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64
estimatedTotal uint64
sessionDuration time.Duration
expireTimer *time.Timer
@@ -601,6 +631,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
optsNew.NumGPU = -1
}
// Normalize the NumCtx for parallelism
optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
@@ -661,7 +694,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
freeMemoryNow += gpu.FreeMemory
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.8 {
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
finished <- struct{}{}
return
@@ -686,8 +719,8 @@ func (runner *runnerRef) LogValue() slog.Value {
)
}
attrs = append(attrs,
slog.String("size", format.HumanBytes2(runner.totalSize)),
slog.String("vram", format.HumanBytes2(runner.vramSize)),
slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
slog.Int("parallel", runner.numParallel),
slog.Int("pid", runner.pid),
slog.String("model", runner.modelPath),
@@ -717,7 +750,95 @@ func (a ByDurationAndName) Less(i, j int) bool {
// type BySize []*runnerRef
// func (a BySize) Len() int { return len(a) }
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var numParallelToTry []int
if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
} else {
numParallelToTry = []int{*numParallel}
}
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
*numParallel = p
return gpuSubset
}
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
*numParallel = p
return sgl
}
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
if *numParallel <= 0 {
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
}
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
@@ -754,13 +875,6 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) unloadAllRunners() {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
if s.activeLoading != nil {
slog.Debug("shutting down currently loading runner")
s.activeLoading.Close()
s.activeLoading = nil
}
for model, runner := range s.loaded {
if runner.llama != nil {
slog.Debug("shutting down runner", "model", model)
@@ -787,3 +901,18 @@ func (s *Scheduler) expireRunner(model *Model) {
runner.refMu.Unlock()
}
}
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
}
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
return s.findRunnerToUnload()
}

View File

@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
return nil, errors.New("something failed to load model blah")
}
gpus := discover.GpuInfoList{}
s.load(req, f, gpus, false)
s.load(req, f, gpus, 0)
require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()
@@ -61,17 +61,16 @@ func TestLoad(t *testing.T) {
err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
require.NoError(t, err)
case resp := <-req.successCh:
require.Equal(t, uint64(10), resp.vramSize)
require.Equal(t, uint64(10), resp.estimatedVRAM)
require.Equal(t, uint(1), resp.refCount)
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
@@ -80,7 +79,7 @@ func TestLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, false)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
@@ -105,11 +104,10 @@ type reqBundle struct {
}
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
b := &reqBundle{}
b.ctx, b.ctxDone = context.WithCancel(ctx)
t.Helper()
@@ -146,7 +144,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}
@@ -264,10 +262,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
// Multiple loaded models
a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
@@ -420,12 +418,11 @@ func TestExpireRunner(t *testing.T) {
var f *ggml.GGML
gpus := discover.GpuInfoList{}
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
@@ -509,7 +506,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2},
}
finished := make(chan *LlmRequest)
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount)
@@ -544,8 +541,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900
llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
@@ -560,6 +557,40 @@ func TestUpdateFreeSpace(t *testing.T) {
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
gpus := discover.GpuInfoList{
{
Library: "cuda",
ID: "0",
},
{
Library: "cuda",
ID: "1",
},
}
r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
s := InitScheduler(ctx)
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loadedMu.Unlock()
tmp := s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "1", tmp[0].ID)
r1.gpus = discover.GpuInfoList{gpus[1]}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "0", tmp[0].ID)
r1.gpus = discover.GpuInfoList{}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
@@ -584,7 +615,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
llm := &mockLlm{vramByGPU: map[string]uint64{}}
llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
do := api.DefaultOptions()
runner := &runnerRef{
model: &Model{
@@ -631,8 +662,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
s := InitScheduler(ctx)
s.unloadAllRunners()
@@ -650,7 +681,7 @@ func TestUnloadAllRunners(t *testing.T) {
}
func TestUnload(t *testing.T) {
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, numParallel: 1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
r1.unload()
@@ -676,40 +707,62 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh)
}
type mockLlm struct {
modelPath string
pingResp error
waitResp error
completionResp error
embeddingResp []float32
embeddingRespErr error
tokenizeResp []int
tokenizeRespErr error
detokenizeResp string
detonekizeRespErr error
closeResp error
closeCalled bool
vramSize uint64
totalSize uint64
vramByGPU map[string]uint64
}
func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)
func (s *mockLlm) ModelPath() string {
return s.modelPath
}
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
if requireFull {
for _, g := range gpus {
if g.FreeMemory >= s.vramSize {
return nil
}
s.getGpuFn = func() discover.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []discover.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
return llm.ErrLoadRequiredFull
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
return nil
}
type mockLlm struct {
pingResp error
waitResp error
completionResp error
embeddingResp []float32
embeddingRespErr error
tokenizeResp []int
tokenizeRespErr error
detokenizeResp string
detonekizeRespErr error
closeResp error
closeCalled bool
estimatedVRAM uint64
estimatedTotal uint64
estimatedVRAMByGPU map[string]uint64
}
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
@@ -732,7 +785,7 @@ func (s *mockLlm) Close() error {
s.closeCalled = true
return s.closeResp
}
func (s *mockLlm) VRAMSize() uint64 { return s.vramSize }
func (s *mockLlm) TotalSize() uint64 { return s.totalSize }
func (s *mockLlm) VRAMByGPU(gpuid string) uint64 { return s.vramByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 }
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 }