Compare commits
3 Commits
v0.11.5-rc
...
mxyng/16-b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69f3dfdedf | ||
|
|
7bd3f0269c | ||
|
|
276c4df770 |
@@ -22,7 +22,7 @@
|
||||
"name": "CUDA 12",
|
||||
"inherits": [ "CUDA" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
|
||||
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
|
||||
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
|
||||
}
|
||||
},
|
||||
@@ -30,14 +30,14 @@
|
||||
"name": "JetPack 5",
|
||||
"inherits": [ "CUDA" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_CUDA_ARCHITECTURES": "72;87"
|
||||
"CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "JetPack 6",
|
||||
"inherits": [ "CUDA" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_CUDA_ARCHITECTURES": "87"
|
||||
"CMAKE_CUDA_ARCHITECTURES": "87-virtual"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -86,8 +86,6 @@ RUN go mod download
|
||||
COPY . .
|
||||
ARG GOFLAGS="'-ldflags=-w -s'"
|
||||
ENV CGO_ENABLED=1
|
||||
ARG CGO_CFLAGS
|
||||
ARG CGO_CXXFLAGS
|
||||
RUN --mount=type=cache,target=/root/.cache/go-build \
|
||||
go build -trimpath -buildmode=pie -o /bin/ollama .
|
||||
|
||||
|
||||
@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
|
||||
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
|
||||
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
|
||||
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
|
||||
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
|
||||
|
||||
### Cloud
|
||||
|
||||
@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
|
||||
- [Ollama for D](https://github.com/kassane/ollama-d)
|
||||
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
|
||||
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
|
||||
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
|
||||
|
||||
### Mobile
|
||||
|
||||
|
||||
21
api/types.go
21
api/types.go
@@ -90,10 +90,6 @@ type GenerateRequest struct {
|
||||
// (request that thinking _not_ be used) and unset (use the old behavior
|
||||
// before this option was introduced)
|
||||
Think *ThinkValue `json:"think,omitempty"`
|
||||
|
||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||
// template instead of calling the model.
|
||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||
}
|
||||
|
||||
// ChatRequest describes a request sent by [Client.Chat].
|
||||
@@ -124,10 +120,6 @@ type ChatRequest struct {
|
||||
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
|
||||
// for supported models.
|
||||
Think *ThinkValue `json:"think,omitempty"`
|
||||
|
||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||
// template instead of calling the model.
|
||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||
}
|
||||
|
||||
type Tools []Tool
|
||||
@@ -316,19 +308,6 @@ type ChatResponse struct {
|
||||
Metrics
|
||||
}
|
||||
|
||||
// DebugInfo contains debug information for template rendering
|
||||
type DebugInfo struct {
|
||||
RenderedTemplate string `json:"rendered_template"`
|
||||
ImageCount int `json:"image_count,omitempty"`
|
||||
}
|
||||
|
||||
// DebugTemplateResponse is returned when _debug_render_only is set to true
|
||||
type DebugTemplateResponse struct {
|
||||
Model string `json:"model"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
DebugInfo DebugInfo `json:"_debug_info"`
|
||||
}
|
||||
|
||||
type Metrics struct {
|
||||
TotalDuration time.Duration `json:"total_duration,omitempty"`
|
||||
LoadDuration time.Duration `json:"load_duration,omitempty"`
|
||||
|
||||
@@ -1612,7 +1612,6 @@ func NewCLI() *cobra.Command {
|
||||
appendEnvDocs(cmd, []envconfig.EnvVar{
|
||||
envVars["OLLAMA_DEBUG"],
|
||||
envVars["OLLAMA_HOST"],
|
||||
envVars["OLLAMA_CONTEXT_LENGTH"],
|
||||
envVars["OLLAMA_KEEP_ALIVE"],
|
||||
envVars["OLLAMA_MAX_LOADED_MODELS"],
|
||||
envVars["OLLAMA_MAX_QUEUE"],
|
||||
|
||||
21
convert/bfloat16/bfloat16.go
Normal file
21
convert/bfloat16/bfloat16.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package bfloat16
|
||||
|
||||
import "math"
|
||||
|
||||
// FromFloat32s converts a slice of float32 values to a slice of bfloat16 values, represented as uint16s.
//
// Each value is converted by keeping the upper 16 bits of its binary32
// encoding (sign, 8 exponent bits, top 7 mantissa bits). The lower 16 mantissa
// bits are truncated, not rounded. The returned slice always has the same
// length as f32s and is never nil.
//
// A NaN whose payload lives entirely in the discarded low bits would truncate
// to an infinity bit pattern, so for those inputs the quiet bit is set to keep
// the result a NaN.
func FromFloat32s(f32s []float32) (u16s []uint16) {
	u16s = make([]uint16, len(f32s))
	for i := range f32s {
		bits := math.Float32bits(f32s[i])
		hi := uint16(bits >> 16)

		// Exponent all ones with a non-zero mantissa is NaN. If none of the
		// surviving 7 mantissa bits are set, plain truncation would yield
		// +/-Inf, so force the quiet-NaN bit instead.
		isNaN := bits&0x7F800000 == 0x7F800000 && bits&0x007FFFFF != 0
		if isNaN && hi&0x007F == 0 {
			hi |= 0x0040
		}

		u16s[i] = hi
	}
	return u16s
}
|
||||
|
||||
// Float32s widens bfloat16 values, given as raw uint16 bit patterns, into
// float32 values. Each 16-bit pattern becomes the upper half of the binary32
// encoding and the lower 16 bits are zero. The result has the same length as
// u16s.
func Float32s(u16s []uint16) (f32s []float32) {
	f32s = make([]float32, 0, len(u16s))
	for _, half := range u16s {
		widened := uint32(half) << 16
		f32s = append(f32s, math.Float32frombits(widened))
	}
	return f32s
}
|
||||
82
convert/bfloat16/bfloat16_test.go
Normal file
82
convert/bfloat16/bfloat16_test.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package bfloat16
|
||||
|
||||
import (
|
||||
"math"
|
||||
"math/rand/v2"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestBfloat16(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
input uint16
|
||||
want uint32
|
||||
}{
|
||||
// Zero cases
|
||||
{"positive zero", 0x0000, 0x0},
|
||||
{"negative zero", 0x8000, 0x80000000},
|
||||
|
||||
// Normal numbers
|
||||
{"one", 0x3F80, 0x3F800000},
|
||||
{"negative one", 0xBF80, 0xBF800000},
|
||||
{"two", 0x4000, 0x40000000},
|
||||
{"half", 0x3F00, 0x3F000000},
|
||||
{"quarter", 0x3E80, 0x3E800000},
|
||||
{"max finite", 0x7F7F, 0x7F7F0000},
|
||||
{"min positive normal", 0x0080, 0x00800000},
|
||||
|
||||
// Infinity cases
|
||||
{"positive infinity", 0x7F80, 0x7F800000},
|
||||
{"negative infinity", 0xFF80, 0xFF800000},
|
||||
|
||||
// NaN cases
|
||||
{"NaN", 0x7FC0, 0x7FC00000},
|
||||
{"NaN with payload", 0x7FC1, 0x7FC10000},
|
||||
|
||||
// Subnormal cases
|
||||
{"min positive subnormal", 0x0001, 0x00010000},
|
||||
{"max subnormal", 0x007F, 0x007F0000},
|
||||
|
||||
// Powers of 2
|
||||
{"2^10", 0x4480, 0x44800000},
|
||||
{"2^-10", 0x3A80, 0x3A800000},
|
||||
{"2^20", 0x4B80, 0x4B800000},
|
||||
|
||||
// Common approximations in BF16
|
||||
{"pi approximation", 0x4049, 0x40490000},
|
||||
{"e approximation", 0x402E, 0x402E0000},
|
||||
{"sqrt(2) approximation", 0x3FB5, 0x3FB50000},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("Float32s", func(t *testing.T) {
|
||||
got := Float32s([]uint16{tt.input})[0]
|
||||
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
|
||||
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("FromFloat32s", func(t *testing.T) {
|
||||
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
|
||||
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
|
||||
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBfloat16(b *testing.B) {
|
||||
f32s := make([]float32, 1_000_000)
|
||||
for i := range f32s {
|
||||
f32s[i] = rand.Float32()
|
||||
}
|
||||
for b.Loop() {
|
||||
Float32s(FromFloat32s(f32s))
|
||||
}
|
||||
}
|
||||
97
convert/float16/float16.go
Normal file
97
convert/float16/float16.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package float16
|
||||
|
||||
import (
|
||||
"math"
|
||||
)
|
||||
|
||||
// FromFloat32s converts a slice of float32 values to a slice of IEEE 754
// half-precision (binary16) values, represented as uint16s.
//
// The returned slice has the same length as f32s. Conversion rules:
//   - Zero and infinity keep their sign.
//   - NaN stays NaN: the top 10 payload bits are kept, and if they are all
//     zero the quiet bit is set so the result does not collapse to infinity.
//   - Magnitudes too large for binary16 become infinity of the same sign.
//   - Magnitudes too small for a binary16 subnormal become signed zero.
//   - Everything else has its mantissa truncated (not rounded) from 23 bits
//     to 10 bits, producing a subnormal where the exponent requires it.
func FromFloat32s(f32s []float32) (u16s []uint16) {
	u16s = make([]uint16, len(f32s))
	for i := range f32s {
		bits := math.Float32bits(f32s[i])
		sign := (bits >> 31) & 0x1
		exponent := (bits >> 23) & 0xFF
		mantissa := bits & 0x7FFFFF

		switch {
		case exponent == 0xFF && mantissa == 0:
			// Infinity
			u16s[i] = uint16((sign << 15) | 0x7C00)

		case exponent == 0xFF:
			// NaN. Dropping the low 13 payload bits may leave nothing, which
			// would encode infinity; fall back to the quiet-NaN bit.
			payload := mantissa >> 13
			if payload == 0 {
				payload = 0x200
			}
			u16s[i] = uint16((sign << 15) | 0x7C00 | payload)

		case exponent == 0 && mantissa == 0:
			// Zero
			u16s[i] = uint16(sign << 15)

		default:
			// Convert exponent from FP32 bias (127) to FP16 bias (15)
			e := int(exponent) - 127 + 15
			switch {
			case e >= 31:
				// Overflow to infinity
				u16s[i] = uint16((sign << 15) | 0x7C00)
			case e < -10:
				// Underflow past the smallest subnormal: signed zero
				u16s[i] = uint16(sign << 15)
			case e <= 0:
				// Subnormal number: restore the implicit leading 1 and shift
				// it into place before dropping the low 13 bits.
				mantissa = (mantissa | 0x800000) >> uint(-e+1)
				u16s[i] = uint16((sign << 15) | (mantissa >> 13))
			default:
				// Normal number - truncate mantissa from 23 to 10 bits
				u16s[i] = uint16((sign << 15) | (uint32(e) << 10) | (mantissa >> 13))
			}
		}
	}

	return u16s
}
|
||||
|
||||
// Float32s converts a slice of IEEE 754 half-precision (binary16) values,
// represented as uint16s, to float32 values. The conversion is exact: zeros,
// infinities and NaN payloads keep their sign and bits, and binary16
// subnormals are renormalized into float32 normals. The result has the same
// length as u16s.
func Float32s(u16s []uint16) (f32s []float32) {
	f32s = make([]float32, len(u16s))
	for i, half := range u16s {
		signBit := uint32(half>>15) << 31
		exp := uint32(half>>10) & 0x1F
		frac := uint32(half) & 0x3FF

		var out uint32
		switch {
		case exp == 0 && frac == 0:
			// Zero
			out = signBit

		case exp == 0:
			// Subnormal: shift left until the leading 1 reaches the implicit
			// bit position, lowering the exponent once per shift.
			e := uint32(127 - 15 + 1)
			for frac&0x400 == 0 {
				frac <<= 1
				e--
			}
			out = signBit | (e << 23) | ((frac & 0x3FF) << 13)

		case exp == 0x1F:
			// Infinity (frac == 0) or NaN (frac != 0, payload preserved)
			out = signBit | 0x7F800000 | (frac << 13)

		default:
			// Normal number: rebias the exponent from 15 to 127
			out = signBit | ((exp + 127 - 15) << 23) | (frac << 13)
		}

		f32s[i] = math.Float32frombits(out)
	}
	return f32s
}
|
||||
75
convert/float16/float16_test.go
Normal file
75
convert/float16/float16_test.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package float16
|
||||
|
||||
import (
|
||||
"math"
|
||||
"math/rand/v2"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestFloat16(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
input uint16
|
||||
want uint32
|
||||
}{
|
||||
// Zero cases
|
||||
{"positive zero", 0x0000, 0x0},
|
||||
{"negative zero", 0x8000, 0x80000000},
|
||||
|
||||
// Normal numbers
|
||||
{"one", 0x3C00, 0x3F800000},
|
||||
{"negative one", 0xBC00, 0xBF800000},
|
||||
{"two", 0x4000, 0x40000000},
|
||||
{"half", 0x3800, 0x3F000000},
|
||||
{"max normal", 0x7BFF, 0x477fe000},
|
||||
{"min positive normal", 0x0400, 0x38800000},
|
||||
|
||||
// Infinity cases
|
||||
{"positive infinity", 0x7C00, 0x7F800000},
|
||||
{"negative infinity", 0xFC00, 0xFF800000},
|
||||
|
||||
// NaN cases
|
||||
{"NaN", 0x7C01, 0x7f802000},
|
||||
{"NaN with payload", 0x7E00, 0x7FC00000},
|
||||
|
||||
// Subnormal cases
|
||||
{"min positive subnormal", 0x0001, 0x33800000},
|
||||
{"max subnormal", 0x03FF, 0x387fc000},
|
||||
|
||||
// Common values
|
||||
{"pi approximation", 0x4248, 0x40490000},
|
||||
{"e approximation", 0x416F, 0x402de000},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("Float32s", func(t *testing.T) {
|
||||
got := Float32s([]uint16{tt.input})[0]
|
||||
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
|
||||
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("FromFloat32s", func(t *testing.T) {
|
||||
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
|
||||
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
|
||||
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkFloat16(b *testing.B) {
|
||||
f32s := make([]float32, 1_000_000)
|
||||
for i := range f32s {
|
||||
f32s[i] = rand.Float32()
|
||||
}
|
||||
for b.Loop() {
|
||||
Float32s(FromFloat32s(f32s))
|
||||
}
|
||||
}
|
||||
@@ -13,8 +13,8 @@ import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/d4l3k/go-bfloat16"
|
||||
"github.com/x448/float16"
|
||||
"github.com/ollama/ollama/convert/bfloat16"
|
||||
"github.com/ollama/ollama/convert/float16"
|
||||
)
|
||||
|
||||
type safetensorMetadata struct {
|
||||
@@ -163,18 +163,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
f32s = make([]float32, len(u16s))
|
||||
for i := range u16s {
|
||||
f32s[i] = float16.Frombits(u16s[i]).Float32()
|
||||
}
|
||||
f32s = float16.Float32s(u16s)
|
||||
|
||||
case "BF16":
|
||||
u8s := make([]uint8, st.size)
|
||||
if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
|
||||
u16s := make([]uint16, st.size/2)
|
||||
if err = binary.Read(br, binary.LittleEndian, u16s); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
f32s = bfloat16.DecodeFloat32(u8s)
|
||||
f32s = bfloat16.Float32s(u16s)
|
||||
|
||||
default:
|
||||
return 0, fmt.Errorf("unknown data type: %s", st.dtype)
|
||||
}
|
||||
@@ -190,15 +188,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
|
||||
case tensorKindFP32:
|
||||
return 0, binary.Write(w, binary.LittleEndian, f32s)
|
||||
case tensorKindFP16:
|
||||
f16s := make([]uint16, len(f32s))
|
||||
for i := range f32s {
|
||||
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
|
||||
}
|
||||
|
||||
return 0, binary.Write(w, binary.LittleEndian, f16s)
|
||||
return 0, binary.Write(w, binary.LittleEndian, float16.FromFloat32s(f32s))
|
||||
case tensorKindBF16:
|
||||
u8s := bfloat16.EncodeFloat32(f32s)
|
||||
return 0, binary.Write(w, binary.LittleEndian, u8s)
|
||||
return 0, binary.Write(w, binary.LittleEndian, bfloat16.FromFloat32s(f32s))
|
||||
default:
|
||||
return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
|
||||
}
|
||||
|
||||
@@ -7,9 +7,9 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/d4l3k/go-bfloat16"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/x448/float16"
|
||||
"github.com/ollama/ollama/convert/bfloat16"
|
||||
"github.com/ollama/ollama/convert/float16"
|
||||
)
|
||||
|
||||
func TestSafetensors(t *testing.T) {
|
||||
@@ -21,6 +21,11 @@ func TestSafetensors(t *testing.T) {
|
||||
}
|
||||
defer root.Close()
|
||||
|
||||
f32s := make([]float32, 32)
|
||||
for i := range f32s {
|
||||
f32s[i] = float32(i)
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name,
|
||||
dtype string
|
||||
@@ -36,11 +41,6 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 4, // 32 floats, each 4 bytes
|
||||
shape: []uint64{32},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
f32s := make([]float32, 32)
|
||||
for i := range f32s {
|
||||
f32s[i] = float32(i)
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -62,11 +62,6 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 4, // 32 floats, each 4 bytes
|
||||
shape: []uint64{16, 2},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
f32s := make([]float32, 32)
|
||||
for i := range f32s {
|
||||
f32s[i] = float32(i)
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -84,12 +79,7 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 2, // 32 floats, each 2 bytes
|
||||
shape: []uint64{16, 2},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
u16s := make([]uint16, 32)
|
||||
for i := range u16s {
|
||||
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
},
|
||||
@@ -106,12 +96,7 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 2, // 32 floats, each 2 bytes
|
||||
shape: []uint64{32},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
u16s := make([]uint16, 32)
|
||||
for i := range u16s {
|
||||
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
},
|
||||
@@ -132,12 +117,7 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 2, // 32 brain floats, each 2 bytes
|
||||
shape: []uint64{16, 2},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
f32s := make([]float32, 32)
|
||||
for i := range f32s {
|
||||
f32s[i] = float32(i)
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
},
|
||||
@@ -154,12 +134,7 @@ func TestSafetensors(t *testing.T) {
|
||||
size: 32 * 2, // 32 brain floats, each 2 bytes
|
||||
shape: []uint64{32},
|
||||
setup: func(t *testing.T, f *os.File) {
|
||||
f32s := make([]float32, 32)
|
||||
for i := range f32s {
|
||||
f32s[i] = float32(i)
|
||||
}
|
||||
|
||||
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
},
|
||||
|
||||
@@ -97,7 +97,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
return a < b
|
||||
})
|
||||
gpuCount := 0
|
||||
gpuOrdinalID := 0
|
||||
for _, match := range matches {
|
||||
slog.Debug("evaluating amdgpu node " + match)
|
||||
fp, err := os.Open(match)
|
||||
@@ -188,6 +187,10 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Keep track of numeric IDs based on valid GPUs
|
||||
gpuID := gpuCount
|
||||
gpuCount += 1
|
||||
|
||||
// Look up the memory for the current node
|
||||
totalMemory := uint64(0)
|
||||
usedMemory := uint64(0)
|
||||
@@ -266,7 +269,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
if uniqueID != 0 {
|
||||
ID = fmt.Sprintf("GPU-%016x", uniqueID)
|
||||
} else {
|
||||
ID = strconv.Itoa(gpuOrdinalID)
|
||||
ID = strconv.Itoa(gpuID)
|
||||
}
|
||||
|
||||
gpuInfo := RocmGPUInfo{
|
||||
@@ -284,40 +287,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
DriverMinor: driverMinor,
|
||||
},
|
||||
usedFilepath: usedFile,
|
||||
index: gpuCount,
|
||||
index: gpuID,
|
||||
}
|
||||
|
||||
// Keep track of numeric IDs based on valid GPUs
|
||||
gpuCount += 1
|
||||
|
||||
// If the user wants to filter to a subset of devices, filter out if we aren't a match
|
||||
if len(visibleDevices) > 0 {
|
||||
include := false
|
||||
for _, visible := range visibleDevices {
|
||||
if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
|
||||
include = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !include {
|
||||
reason := "filtering out device per user request"
|
||||
slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
|
||||
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
||||
GpuInfo: gpuInfo.GpuInfo,
|
||||
Reason: reason,
|
||||
})
|
||||
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Ordinal IDs are based on the visible GPUs
|
||||
gpuOrdinalID += 1
|
||||
|
||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||
if totalMemory < IGPUMemLimit {
|
||||
reason := "unsupported Radeon iGPU detected skipping"
|
||||
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
|
||||
slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
|
||||
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
||||
GpuInfo: gpuInfo.GpuInfo,
|
||||
Reason: reason,
|
||||
@@ -330,7 +306,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
}
|
||||
if int(major) < minVer {
|
||||
reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
|
||||
slog.Warn(reason, "gpu", gpuInfo.ID)
|
||||
slog.Warn(reason, "gpu", gpuID)
|
||||
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
||||
GpuInfo: gpuInfo.GpuInfo,
|
||||
Reason: reason,
|
||||
@@ -339,8 +315,29 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
|
||||
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
|
||||
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
|
||||
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
|
||||
|
||||
// If the user wants to filter to a subset of devices, filter out if we aren't a match
|
||||
if len(visibleDevices) > 0 {
|
||||
include := false
|
||||
for _, visible := range visibleDevices {
|
||||
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
|
||||
include = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !include {
|
||||
reason := "filtering out device per user request"
|
||||
slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
|
||||
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
||||
GpuInfo: gpuInfo.GpuInfo,
|
||||
Reason: reason,
|
||||
})
|
||||
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Final validation is gfx compatibility - load the library if we haven't already loaded it
|
||||
// even if the user overrides, we still need to validate the library
|
||||
|
||||
@@ -75,7 +75,7 @@ for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
|
||||
import { Ollama } from 'ollama';
|
||||
|
||||
const ollama = new Ollama({
|
||||
host: 'https://ollama.com',
|
||||
host: 'https://ollama.com',
|
||||
headers: {
|
||||
Authorization: "Bearer <api key>"
|
||||
}
|
||||
|
||||
@@ -185,8 +185,6 @@ var (
|
||||
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
|
||||
// Auth enables authentication between the Ollama client and server
|
||||
UseAuth = Bool("OLLAMA_AUTH")
|
||||
// Enable the new memory estimation logic
|
||||
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
|
||||
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
|
||||
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
|
||||
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
|
||||
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
|
||||
|
||||
// Informational
|
||||
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
|
||||
|
||||
@@ -480,8 +480,6 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
||||
}
|
||||
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
context *= uint64(numParallel)
|
||||
|
||||
embedding := f.KV().EmbeddingLength()
|
||||
heads := f.KV().HeadCountMax()
|
||||
headsKV := f.KV().HeadCountKVMax()
|
||||
@@ -752,11 +750,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
||||
|
||||
// SupportsKVCacheType checks if the requested cache type is supported
|
||||
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
||||
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
|
||||
// gpt-oss uses attention with sinks which does not support quantized cache types
|
||||
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
|
||||
return cacheType == "f16"
|
||||
}
|
||||
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
|
||||
}
|
||||
|
||||
|
||||
2
go.mod
2
go.mod
@@ -10,13 +10,11 @@ require (
|
||||
github.com/olekukonko/tablewriter v0.0.5
|
||||
github.com/spf13/cobra v1.7.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
github.com/x448/float16 v0.8.4
|
||||
golang.org/x/sync v0.12.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/agnivade/levenshtein v1.1.1
|
||||
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
|
||||
github.com/dlclark/regexp2 v1.11.4
|
||||
github.com/emirpasic/gods/v2 v2.0.0-alpha
|
||||
github.com/google/go-cmp v0.7.0
|
||||
|
||||
4
go.sum
4
go.sum
@@ -35,8 +35,6 @@ github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARu
|
||||
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
|
||||
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
@@ -197,8 +195,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
|
||||
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
||||
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
|
||||
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
|
||||
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
|
||||
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
|
||||
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"math/rand"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
@@ -17,157 +16,245 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
|
||||
func TestConcurrentGenerate(t *testing.T) {
|
||||
// Assumes all requests have the same model
|
||||
req, resp := GenerateRequests()
|
||||
numParallel := int(envconfig.NumParallel() + 1)
|
||||
iterLimit := 3
|
||||
func TestMultiModelConcurrency(t *testing.T) {
|
||||
var (
|
||||
req = [2]api.GenerateRequest{
|
||||
{
|
||||
Model: smol,
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "qwen3:0.6b",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
},
|
||||
}
|
||||
resp = [2][]string{
|
||||
{"sunlight"},
|
||||
{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
|
||||
}
|
||||
)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
|
||||
defer cancel()
|
||||
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for i := 0; i < len(req); i++ {
|
||||
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
|
||||
}
|
||||
|
||||
for i := 0; i < len(req); i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
// Note: CPU based inference can crawl so don't give up too quickly
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestIntegrationConcurrentPredict(t *testing.T) {
|
||||
req, resp := GenerateRequests()
|
||||
reqLimit := len(req)
|
||||
iterLimit := 5
|
||||
|
||||
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
|
||||
maxVram, err := strconv.ParseUint(s, 10, 64)
|
||||
require.NoError(t, err)
|
||||
// Don't hammer on small VRAM cards...
|
||||
if maxVram < 4*format.GibiByte {
|
||||
reqLimit = min(reqLimit, 2)
|
||||
iterLimit = 2
|
||||
}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// Get the server running (if applicable) warm the model up with a single initial request
|
||||
slog.Info("loading", "model", req[0].Model)
|
||||
err := client.Generate(ctx,
|
||||
&api.GenerateRequest{Model: req[0].Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
|
||||
func(response api.GenerateResponse) error { return nil },
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to load model %s: %s", req[0].Model, err)
|
||||
}
|
||||
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
r := rand.New(rand.NewSource(0))
|
||||
wg.Add(numParallel)
|
||||
for i := range numParallel {
|
||||
wg.Add(reqLimit)
|
||||
for i := 0; i < reqLimit; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := 0; j < iterLimit; j++ {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
slog.Info("exceeded soft timeout, winding down test")
|
||||
return
|
||||
}
|
||||
k := r.Int() % len(req)
|
||||
slog.Info("Starting", "thread", i, "iter", j)
|
||||
slog.Info("Starting", "req", i, "iter", j)
|
||||
// On slower GPUs it can take a while to process the concurrent requests
|
||||
// so we allow a much longer initial timeout
|
||||
DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// Stress the scheduler and attempt to load more models than will fit to cause thrashing
|
||||
// This test will always load at least 2 models even on CPU based systems
|
||||
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
||||
func TestMultiModelStress(t *testing.T) {
|
||||
s := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
||||
if s == "" {
|
||||
s = "0"
|
||||
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
||||
}
|
||||
|
||||
maxVram, err := strconv.ParseUint(s, 10, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
smallModels := []string{
|
||||
"llama3.2:1b",
|
||||
"qwen3:0.6b",
|
||||
"gemma:2b",
|
||||
"deepseek-r1:1.5b",
|
||||
"starcoder2:3b",
|
||||
}
|
||||
mediumModels := []string{
|
||||
"qwen3:8b",
|
||||
"llama2",
|
||||
"deepseek-r1:7b",
|
||||
"mistral",
|
||||
"dolphin-mistral",
|
||||
"gemma:7b",
|
||||
"codellama:7b",
|
||||
if maxVram < 2*format.GibiByte {
|
||||
t.Skip("VRAM less than 2G, skipping model stress tests")
|
||||
}
|
||||
|
||||
var chosenModels []string
|
||||
type model struct {
|
||||
name string
|
||||
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
|
||||
}
|
||||
|
||||
smallModels := []model{
|
||||
{
|
||||
name: "llama3.2:1b",
|
||||
size: 2876 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "qwen3:0.6b",
|
||||
size: 1600 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "gemma:2b",
|
||||
size: 2364 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "deepseek-r1:1.5b",
|
||||
size: 2048 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "starcoder2:3b",
|
||||
size: 2166 * format.MebiByte,
|
||||
},
|
||||
}
|
||||
mediumModels := []model{
|
||||
{
|
||||
name: "qwen3:8b",
|
||||
size: 6600 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "llama2",
|
||||
size: 5118 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "deepseek-r1:7b",
|
||||
size: 5600 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "mistral",
|
||||
size: 4620 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "dolphin-mistral",
|
||||
size: 4620 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "gemma:7b",
|
||||
size: 5000 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "codellama:7b",
|
||||
size: 5118 * format.MebiByte,
|
||||
},
|
||||
}
|
||||
|
||||
// These seem to be too slow to be useful...
|
||||
// largeModels := []model{
|
||||
// {
|
||||
// name: "llama2:13b",
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "codellama:13b",
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "orca-mini:13b",
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "gemma:7b",
|
||||
// size: 5000 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "starcoder2:15b",
|
||||
// size: 9100 * format.MebiByte,
|
||||
// },
|
||||
// }
|
||||
|
||||
var chosenModels []model
|
||||
switch {
|
||||
case maxVram < 10000*format.MebiByte:
|
||||
slog.Info("selecting small models")
|
||||
chosenModels = smallModels
|
||||
// case maxVram < 30000*format.MebiByte:
|
||||
default:
|
||||
slog.Info("selecting medium models")
|
||||
chosenModels = mediumModels
|
||||
// default:
|
||||
// slog.Info("selecting large models")
|
||||
// chosenModels = largeModels
|
||||
}
|
||||
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
req, resp := GenerateRequests()
|
||||
|
||||
for i := range req {
|
||||
if i > len(chosenModels) {
|
||||
break
|
||||
}
|
||||
req[i].Model = chosenModels[i].name
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// Make sure all the models are pulled before we get started
|
||||
for _, model := range chosenModels {
|
||||
require.NoError(t, PullIfMissing(ctx, client, model))
|
||||
for _, r := range req {
|
||||
require.NoError(t, PullIfMissing(ctx, client, r.Model))
|
||||
}
|
||||
|
||||
// Determine how many models we can load in parallel before we exceed VRAM
|
||||
// The intent is to go 1 over what can fit so we force the scheduler to thrash
|
||||
targetLoadCount := 0
|
||||
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
|
||||
for i, model := range chosenModels {
|
||||
req := &api.GenerateRequest{Model: model}
|
||||
slog.Info("loading", "model", model)
|
||||
err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
|
||||
if err != nil {
|
||||
t.Fatalf("failed to load model %s: %s", model, err)
|
||||
}
|
||||
targetLoadCount++
|
||||
if i > 0 {
|
||||
models, err := client.ListRunning(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to list running models: %s", err)
|
||||
}
|
||||
if len(models.Models) < targetLoadCount {
|
||||
loaded := []string{}
|
||||
for _, m := range models.Models {
|
||||
loaded = append(loaded, m.Name)
|
||||
}
|
||||
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if targetLoadCount == len(chosenModels) {
|
||||
// TODO consider retrying the medium models
|
||||
slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
|
||||
}
|
||||
|
||||
r := rand.New(rand.NewSource(0))
|
||||
var wg sync.WaitGroup
|
||||
for i := range targetLoadCount {
|
||||
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
|
||||
for i := 0; i < len(req); i++ {
|
||||
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
|
||||
if i > 1 && consumed > maxVram {
|
||||
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
|
||||
break
|
||||
}
|
||||
consumed += chosenModels[i].size
|
||||
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
|
||||
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
reqs, resps := GenerateRequests()
|
||||
for j := 0; j < 3; j++ {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
slog.Info("exceeded soft timeout, winding down test")
|
||||
return
|
||||
}
|
||||
k := r.Int() % len(reqs)
|
||||
reqs[k].Model = chosenModels[i]
|
||||
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Prompt)
|
||||
DoGenerate(ctx, t, client, reqs[k], resps[k],
|
||||
120*time.Second, // Be extra patient for the model to load initially
|
||||
10*time.Second, // Once results start streaming, fail if they stall
|
||||
)
|
||||
slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
||||
@@ -4,8 +4,6 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -65,51 +63,3 @@ func TestContextExhaustion(t *testing.T) {
|
||||
}
|
||||
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
|
||||
}
|
||||
|
||||
// Send multiple requests with prior context and ensure the response is coherant and expected
|
||||
func TestGenerateWithHistory(t *testing.T) {
|
||||
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
|
||||
req, resp := GenerateRequests()
|
||||
numParallel := 2
|
||||
iterLimit := 2
|
||||
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// Get the server running (if applicable) warm the model up with a single initial request
|
||||
slog.Info("loading", "model", modelOverride)
|
||||
err := client.Generate(ctx,
|
||||
&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
|
||||
func(response api.GenerateResponse) error { return nil },
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to load model %s: %s", modelOverride, err)
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numParallel)
|
||||
for i := range numParallel {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
k := i % len(req)
|
||||
req[k].Model = modelOverride
|
||||
for j := 0; j < iterLimit; j++ {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
slog.Info("exceeded soft timeout, winding down test")
|
||||
return
|
||||
}
|
||||
slog.Info("Starting", "thread", i, "iter", j)
|
||||
// On slower GPUs it can take a while to process the concurrent requests
|
||||
// so we allow a much longer initial timeout
|
||||
c := DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
|
||||
req[k].Context = c
|
||||
req[k].Prompt = "tell me more!"
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
}
|
||||
|
||||
23
integration/testdata/embed.json
vendored
23
integration/testdata/embed.json
vendored
File diff suppressed because one or more lines are too long
@@ -472,19 +472,15 @@ func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRe
|
||||
DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
|
||||
}
|
||||
|
||||
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) []int {
|
||||
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
|
||||
stallTimer := time.NewTimer(initialTimeout)
|
||||
var buf bytes.Buffer
|
||||
var context []int
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
// fmt.Print(".")
|
||||
buf.Write([]byte(response.Response))
|
||||
if !stallTimer.Reset(streamTimeout) {
|
||||
return errors.New("stall was detected while streaming response, aborting")
|
||||
}
|
||||
if len(response.Context) > 0 {
|
||||
context = response.Context
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -507,7 +503,7 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
|
||||
case <-done:
|
||||
if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
|
||||
slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
|
||||
return context
|
||||
return
|
||||
}
|
||||
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
|
||||
// Verify the response contains the expected data
|
||||
@@ -524,7 +520,6 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
|
||||
case <-ctx.Done():
|
||||
t.Error("outer test context done while waiting for generate")
|
||||
}
|
||||
return context
|
||||
}
|
||||
|
||||
// Generate a set of requests
|
||||
@@ -533,35 +528,55 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
|
||||
return []api.GenerateRequest{
|
||||
{
|
||||
Model: smol,
|
||||
Prompt: "why is the ocean blue? Be brief but factual in your reply",
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: smol,
|
||||
Prompt: "why is the color of dirt brown? Be brief but factual in your reply",
|
||||
Prompt: "why is the color of dirt brown?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: smol,
|
||||
Prompt: "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: smol,
|
||||
Prompt: "what is the origin of independence day? Be brief but factual in your reply",
|
||||
Prompt: "what is the origin of independence day?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: smol,
|
||||
Prompt: "what is the composition of air? Be brief but factual in your reply",
|
||||
Prompt: "what is the composition of air?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
[][]string{
|
||||
{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
|
||||
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
|
||||
{"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states"},
|
||||
{"sunlight", "scattering", "interact"},
|
||||
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
|
||||
{"england", "english", "massachusetts", "pilgrims", "british"},
|
||||
{"fourth", "july", "declaration", "independence"},
|
||||
{"nitrogen", "oxygen", "carbon", "dioxide"},
|
||||
}
|
||||
|
||||
3
llama/llama.cpp/src/llama-context.cpp
vendored
3
llama/llama.cpp/src/llama-context.cpp
vendored
@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
const bool output_all = false;
|
||||
// when computing embeddings, all tokens are output
|
||||
const bool output_all = cparams.embeddings;
|
||||
|
||||
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
|
||||
@@ -62,22 +62,6 @@ func BackendInit() {
|
||||
C.llama_backend_init()
|
||||
}
|
||||
|
||||
func EnumerateGPUs() []string {
|
||||
var ids []string
|
||||
|
||||
for i := range C.ggml_backend_dev_count() {
|
||||
device := C.ggml_backend_dev_get(i)
|
||||
|
||||
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
C.ggml_backend_dev_get_props(device, &props)
|
||||
ids = append(ids, C.GoString(props.id))
|
||||
}
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func GetModelArch(modelPath string) (string, error) {
|
||||
mp := C.CString(modelPath)
|
||||
defer C.free(unsafe.Pointer(mp))
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Hiltgen <daniel@ollama.com>
|
||||
Date: Sun, 22 Jun 2025 09:22:05 -0700
|
||||
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
|
||||
|
||||
---
|
||||
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
|
||||
1 file changed, 10 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 3040b2aa..f1e9c180 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
|
||||
ggml_backend_load_best("blas", silent, dir_path);
|
||||
ggml_backend_load_best("cann", silent, dir_path);
|
||||
- ggml_backend_load_best("cuda", silent, dir_path);
|
||||
- ggml_backend_load_best("hip", silent, dir_path);
|
||||
+
|
||||
+ // Avoid mixed hip+cuda configurations
|
||||
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
|
||||
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
|
||||
+ if (!hip_devices && !rocr_devices) {
|
||||
+ ggml_backend_load_best("cuda", silent, dir_path);
|
||||
+ } else {
|
||||
+ ggml_backend_load_best("hip", silent, dir_path);
|
||||
+ }
|
||||
+
|
||||
ggml_backend_load_best("metal", silent, dir_path);
|
||||
ggml_backend_load_best("rpc", silent, dir_path);
|
||||
ggml_backend_load_best("sycl", silent, dir_path);
|
||||
@@ -13,7 +13,7 @@ checks.
|
||||
1 file changed, 18 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 57eae461..c7f9dc3a 100644
|
||||
index 57eae461..9db0c8b5 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
@@ -1,23 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <git@mxy.ng>
|
||||
Date: Mon, 18 Aug 2025 16:58:39 -0700
|
||||
Subject: [PATCH] decode: disable output_all
|
||||
|
||||
---
|
||||
src/llama-context.cpp | 3 +--
|
||||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 26a5cf9c..6ece5263 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
- // when computing embeddings, all tokens are output
|
||||
- const bool output_all = cparams.embeddings;
|
||||
+ const bool output_all = false;
|
||||
|
||||
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
@@ -14,79 +14,13 @@ import (
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||
// The list of GPUs returned will always be the same brand (library)
|
||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
|
||||
for _, gl := range gpus.ByLibrary() {
|
||||
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
|
||||
|
||||
// TODO - potentially sort by performance capability, existing models loaded, etc.
|
||||
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
|
||||
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
|
||||
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
|
||||
|
||||
if !envconfig.SchedSpread() {
|
||||
// Try to pack into as few GPUs as possible, starting from 1 GPU
|
||||
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
|
||||
gpuSubset := sgl[:numGPUs]
|
||||
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
|
||||
|
||||
if ok {
|
||||
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
|
||||
"model", modelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", numParallel,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", numGPUs)
|
||||
return gpuSubset
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO future refinements
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
|
||||
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading",
|
||||
"model", modelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", numParallel,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", len(sgl))
|
||||
return sgl
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
||||
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
|
||||
byLibrary := gpus.ByLibrary()
|
||||
if len(byLibrary) <= 1 {
|
||||
return gpus
|
||||
}
|
||||
var bestEstimate uint64
|
||||
var bestFit int
|
||||
for i, gl := range byLibrary {
|
||||
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
|
||||
if estimatedVRAM > bestEstimate {
|
||||
bestEstimate = estimatedVRAM
|
||||
bestFit = i
|
||||
}
|
||||
}
|
||||
return byLibrary[bestFit]
|
||||
}
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
|
||||
// Split up the GPUs by type and try them
|
||||
var estimatedVRAM uint64
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
|
||||
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
|
||||
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
|
||||
if opts.NumGPU < 0 {
|
||||
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
|
||||
@@ -115,7 +49,7 @@ type MemoryEstimate struct {
|
||||
TotalSize uint64
|
||||
|
||||
// For multi-GPU scenarios, this provides the tensor split parameter
|
||||
TensorSplit []int
|
||||
TensorSplit string
|
||||
|
||||
// For multi-GPU scenarios, this is the size in bytes per GPU
|
||||
GPUSizes []uint64
|
||||
@@ -137,7 +71,7 @@ type MemoryEstimate struct {
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
// The GPUs provided must all be the same Library
|
||||
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
|
||||
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
|
||||
// Graph size for a partial offload, applies to all GPUs
|
||||
var graphPartialOffload uint64
|
||||
|
||||
@@ -178,9 +112,13 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
|
||||
for _, projector := range projectors {
|
||||
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
if llamaEngineProjectorWeights == 0 {
|
||||
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
layers := f.Tensors().GroupLayers()
|
||||
@@ -246,7 +184,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
|
||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||
var layerCount int
|
||||
tensorSplit := make([]int, len(gpus))
|
||||
layerCounts := make([]int, len(gpus))
|
||||
gpuAllocations := make([]uint64, len(gpus))
|
||||
type gs struct {
|
||||
i int
|
||||
@@ -310,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > overhead+used+layerSize {
|
||||
gpuAllocations[g.i] += layerSize
|
||||
tensorSplit[g.i]++
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
} else {
|
||||
@@ -335,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > overhead+used+memoryLastLayer {
|
||||
gpuAllocations[g.i] += memoryLastLayer
|
||||
tensorSplit[g.i]++
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
}
|
||||
@@ -350,7 +288,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
|
||||
// Add the applicable (full or partial) graph allocations
|
||||
for i := range gpus {
|
||||
if tensorSplit[i] <= 0 {
|
||||
if layerCounts[i] <= 0 {
|
||||
continue
|
||||
}
|
||||
if fullyLoaded {
|
||||
@@ -372,6 +310,14 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
}
|
||||
memoryRequiredTotal = memoryRequiredPartial + overflow
|
||||
|
||||
tensorSplit := ""
|
||||
if len(gpus) > 1 {
|
||||
splits := make([]string, len(gpus))
|
||||
for i, count := range layerCounts {
|
||||
splits[i] = strconv.Itoa(count)
|
||||
}
|
||||
tensorSplit = strings.Join(splits, ",")
|
||||
}
|
||||
allocationsList := []string{}
|
||||
for _, a := range gpuAllocations {
|
||||
allocationsList = append(allocationsList, format.HumanBytes2(a))
|
||||
|
||||
@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
projectors := []string{}
|
||||
opts := api.DefaultOptions()
|
||||
t.Run("cpu", func(t *testing.T) {
|
||||
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
|
||||
assert.Equal(t, 0, estimate.Layers)
|
||||
assert.Equal(t, uint64(0), estimate.Graph)
|
||||
})
|
||||
@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
|
||||
for i, s := range []struct {
|
||||
layer0, layer1 uint64
|
||||
expect0, expect1 int
|
||||
expect0, expect1 uint64
|
||||
}{
|
||||
{1, 1, 1, 1},
|
||||
{2, 1, 2, 1},
|
||||
@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
|
||||
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
|
||||
assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
|
||||
assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
|
||||
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
|
||||
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
|
||||
var layerSums uint64
|
||||
for _, b := range estimate.GPUSizes {
|
||||
layerSums += b
|
||||
|
||||
1051
llm/server.go
1051
llm/server.go
File diff suppressed because it is too large
Load Diff
@@ -8,178 +8,9 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
func TestLLMServerFitGPU(t *testing.T) {
|
||||
type gpu struct {
|
||||
library string
|
||||
free int
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
gpus []gpu
|
||||
layers []int
|
||||
numGPU int
|
||||
requireFull bool
|
||||
expected ml.GPULayersList
|
||||
expectedErr error
|
||||
}{
|
||||
{
|
||||
name: "No GPU",
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{},
|
||||
},
|
||||
{
|
||||
name: "Full single GPU",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
|
||||
},
|
||||
{
|
||||
name: "Partial single GPU",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
|
||||
},
|
||||
{
|
||||
name: "Single GPU with numGPU 1",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: 1,
|
||||
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
|
||||
},
|
||||
{
|
||||
name: "Single GPU with numGPU 0",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: 0,
|
||||
expected: ml.GPULayersList{},
|
||||
},
|
||||
{
|
||||
name: "Single GPU with numGPU 999",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
|
||||
numGPU: 999,
|
||||
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU fits on one",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU split",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU partial",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU numGPU 1",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: 1,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU numGPU 2",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: 2,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU numGPU 999",
|
||||
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
|
||||
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: 999,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
|
||||
},
|
||||
{
|
||||
name: "Multi GPU different libraries",
|
||||
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
|
||||
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
|
||||
},
|
||||
{
|
||||
name: "requireFull",
|
||||
gpus: []gpu{{free: 256 * format.MebiByte}},
|
||||
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
|
||||
numGPU: -1,
|
||||
requireFull: true,
|
||||
expectedErr: ErrLoadRequiredFull,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
var systemInfo discover.SystemInfo
|
||||
systemInfo.System.TotalMemory = format.GibiByte
|
||||
systemInfo.System.FreeMemory = 512 * format.MebiByte
|
||||
systemInfo.System.FreeSwap = 256 * format.MebiByte
|
||||
|
||||
gpus := make(discover.GpuInfoList, len(tt.gpus))
|
||||
for i := range tt.gpus {
|
||||
gpus[i].ID = fmt.Sprintf("gpu%d", i)
|
||||
gpus[i].Library = tt.gpus[i].library
|
||||
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
|
||||
}
|
||||
|
||||
s := &ollamaServer{
|
||||
llmServer: llmServer{
|
||||
totalLayers: uint64(len(tt.layers)),
|
||||
options: api.Options{
|
||||
Runner: api.Runner{
|
||||
NumGPU: tt.numGPU,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
|
||||
Weights: make([]ml.Memory, s.totalLayers),
|
||||
Cache: make([]ml.Memory, s.totalLayers),
|
||||
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
|
||||
|
||||
for i := range tt.layers {
|
||||
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
|
||||
}
|
||||
|
||||
for i := range s.mem.GPUs {
|
||||
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
|
||||
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
|
||||
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
|
||||
}
|
||||
|
||||
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
|
||||
if err != tt.expectedErr {
|
||||
t.Fatalf("fitGPU returned error: %v", err)
|
||||
}
|
||||
if gpuLayers.Hash() != tt.expected.Hash() {
|
||||
t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLLMServerCompletionFormat(t *testing.T) {
|
||||
// This test was written to fix an already deployed issue. It is a bit
|
||||
// of a mess, but it's good enough until we can refactor the
|
||||
|
||||
162
ml/backend.go
162
ml/backend.go
@@ -5,14 +5,12 @@ import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"hash/maphash"
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs"
|
||||
)
|
||||
|
||||
@@ -60,89 +58,19 @@ type CacheConfig struct {
|
||||
MaskBatchPadding int
|
||||
}
|
||||
|
||||
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
	// ID is the identifier of the GPU, as reported in DeviceMemory
	ID string

	// Layers is a set of layer indices to load
	Layers []int
}

// String returns a compact description of the assignment. An empty
// assignment renders as "". Layers are listed in ascending order; when they
// form an unbroken run only the count and the first/last index are printed,
// otherwise the count is followed by the full sorted list.
func (g GPULayers) String() string {
	if len(g.Layers) == 0 {
		return ""
	}

	// Sort a private copy. The receiver is a value but Layers still shares
	// its backing array with the caller, so sorting in place would reorder
	// the caller's data (and change a subsequent Hash) just by printing it.
	layers := slices.Clone(g.Layers)
	slices.Sort(layers)

	contiguous := true
	for i, layer := range layers {
		if layer != layers[0]+i {
			contiguous = false
			break
		}
	}

	if contiguous {
		return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(layers), layers[0], layers[len(layers)-1])
	}

	return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(layers), layers)
}

// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers

// String renders every per-GPU assignment, prefixed with the total number of
// assigned layers when that total is non-zero.
func (l GPULayersList) String() string {
	if sum := l.Sum(); sum > 0 {
		return fmt.Sprintf("%v%v", sum, []GPULayers(l))
	}

	return fmt.Sprintf("%v", []GPULayers(l))
}

// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
	var sum int

	for _, g := range l {
		sum += len(g.Layers)
	}

	return sum
}

// hashSeed is shared by every Hash call so that equal assignments produce
// equal hashes for the lifetime of the process. Values are not stable across
// processes.
var hashSeed = maphash.MakeSeed()

// Hash is an identifier of this layer assignment. GPUs without any layers do
// not contribute; the order of GPUs and of the layers within each GPU does.
func (l GPULayersList) Hash() uint64 {
	// Use a per-call hash state seeded from hashSeed rather than a shared
	// package-level maphash.Hash, which would race under concurrent callers.
	var h maphash.Hash
	h.SetSeed(hashSeed)

	var buf [8]byte
	for _, g := range l {
		if len(g.Layers) == 0 {
			continue
		}

		h.WriteString(g.ID)
		for _, layer := range g.Layers {
			// Same bytes as encoding int64(layer) in native byte order,
			// without the reflection and ignored error of binary.Write.
			binary.NativeEndian.PutUint64(buf[:], uint64(int64(layer)))
			h.Write(buf[:])
		}
	}

	return h.Sum64()
}
|
||||
|
||||
// BackendParams controls how the backend loads and executes models
|
||||
type BackendParams struct {
|
||||
// AllocMemory causes the backend to allocate memory for the model. If
|
||||
// false, this is only being used for discovering the required amount of
|
||||
// memory and cannot load the model for running.
|
||||
AllocMemory bool
|
||||
|
||||
// NumThreads sets the number of threads to use if running on the CPU
|
||||
NumThreads int
|
||||
|
||||
// GPULayers is the set of layers to offload to GPUs
|
||||
GPULayers GPULayersList
|
||||
// MainGPU is the index of the primary GPU to use
|
||||
MainGPU int
|
||||
|
||||
// NumGPULayers is the number of layers to offload to GPUs
|
||||
NumGPULayers int
|
||||
|
||||
// TensorSplit is the fraction of the model to offload to each GPU
|
||||
TensorSplit []float32
|
||||
|
||||
// FlashAttention indicates that we should use a fused flash attention kernel
|
||||
FlashAttention bool
|
||||
@@ -213,28 +141,6 @@ type DeviceMemory struct {
|
||||
Graph Memory
|
||||
}
|
||||
|
||||
// Allocated returns the total size of the memory that has been successfully
|
||||
// allocated on this device
|
||||
func (m DeviceMemory) Allocated() uint64 {
|
||||
var mem uint64
|
||||
|
||||
for _, w := range m.Weights {
|
||||
if w.Status == Allocated {
|
||||
mem += w.Size
|
||||
}
|
||||
}
|
||||
for _, c := range m.Cache {
|
||||
if c.Status == Allocated {
|
||||
mem += c.Size
|
||||
}
|
||||
}
|
||||
if m.Graph.Status == Allocated {
|
||||
mem += m.Graph.Size
|
||||
}
|
||||
|
||||
return mem
|
||||
}
|
||||
|
||||
func memoryPresent(mem []Memory) bool {
|
||||
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
|
||||
}
|
||||
@@ -291,58 +197,6 @@ func (m BackendMemory) LogValue() slog.Value {
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
func sumMemory(mem []Memory) uint64 {
|
||||
var sum uint64
|
||||
|
||||
for _, m := range mem {
|
||||
sum += m.Size
|
||||
}
|
||||
|
||||
return sum
|
||||
}
|
||||
|
||||
// Log prints a high level summary of the memory (allocated or not)
|
||||
func (m BackendMemory) Log(level slog.Level) {
|
||||
var total uint64
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := sumMemory(gpu.Weights); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := sumMemory(gpu.Cache); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := sumMemory(m.CPU.Cache); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := gpu.Graph.Size; sum > 0 {
|
||||
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := m.CPU.Graph.Size; sum > 0 {
|
||||
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
if total > 0 {
|
||||
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
|
||||
}
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
||||
|
||||
@@ -10,7 +10,6 @@ import "C"
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
@@ -63,21 +62,12 @@ var initDevices = sync.OnceFunc(func() {
|
||||
}
|
||||
})
|
||||
|
||||
type layerDevice struct {
|
||||
d C.ggml_backend_dev_t
|
||||
bt C.ggml_backend_buffer_type_t
|
||||
}
|
||||
|
||||
type Backend struct {
|
||||
// modelPath is the location of the model data
|
||||
modelPath string
|
||||
|
||||
meta *fsggml.GGML
|
||||
|
||||
// allocMemory means that memory should be allocated for tensors and not
|
||||
// just a dry run
|
||||
allocMemory bool
|
||||
|
||||
// tensorLoadTargets maps from the name of the tensor in the file
|
||||
// to the name that is used by the model definition
|
||||
tensorLoadTargets map[string][]string
|
||||
@@ -88,14 +78,11 @@ type Backend struct {
|
||||
|
||||
tensors map[string]*C.struct_ggml_tensor
|
||||
|
||||
// input is the backend buffer type used for inputs
|
||||
// input is the backend used for inputs
|
||||
input C.ggml_backend_buffer_type_t
|
||||
|
||||
// output is the backend device used for outputs
|
||||
output C.ggml_backend_dev_t
|
||||
|
||||
// layers is the backend used for repeating layers
|
||||
layers map[int]layerDevice
|
||||
layers map[int]C.ggml_backend_buffer_type_t
|
||||
|
||||
// requiredMemory is the cumulative memory allocations needed by the backend
|
||||
requiredMemory *ml.BackendMemory
|
||||
@@ -112,8 +99,6 @@ type Backend struct {
|
||||
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
|
||||
}
|
||||
|
||||
var once sync.Once
|
||||
|
||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
r, err := os.Open(modelPath)
|
||||
if err != nil {
|
||||
@@ -126,17 +111,15 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
once.Do(func() {
|
||||
slog.Info(
|
||||
"",
|
||||
"architecture", meta.KV().Architecture(),
|
||||
"file_type", meta.KV().FileType(),
|
||||
"name", meta.KV().String("general.name"),
|
||||
"description", meta.KV().String("general.description"),
|
||||
"num_tensors", len(meta.Tensors().Items()),
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
})
|
||||
slog.Info(
|
||||
"",
|
||||
"architecture", meta.KV().Architecture(),
|
||||
"file_type", meta.KV().FileType(),
|
||||
"name", meta.KV().String("general.name"),
|
||||
"description", meta.KV().String("general.description"),
|
||||
"num_tensors", len(meta.Tensors().Items()),
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
|
||||
initDevices()
|
||||
|
||||
@@ -156,10 +139,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
switch C.ggml_backend_dev_type(d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
bt := C.ggml_backend_dev_buffer_type(d)
|
||||
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
|
||||
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
|
||||
|
||||
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
||||
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
|
||||
}
|
||||
}
|
||||
@@ -180,8 +160,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
d: d,
|
||||
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
|
||||
})
|
||||
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
|
||||
|
||||
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
|
||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
@@ -191,25 +169,56 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
|
||||
}
|
||||
|
||||
useDefaultSplit := true
|
||||
for _, s := range params.TensorSplit {
|
||||
if s != 0 {
|
||||
useDefaultSplit = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// calculate splits
|
||||
splits := make([]float32, len(gpus))
|
||||
if useDefaultSplit {
|
||||
// default: split on free memory
|
||||
for i := range splits {
|
||||
var free, total C.size_t
|
||||
C.ggml_backend_dev_memory(gpus[i], &free, &total)
|
||||
splits[i] = float32(free)
|
||||
}
|
||||
} else {
|
||||
splits = params.TensorSplit
|
||||
}
|
||||
|
||||
var sum float32
|
||||
// cumulative sum of all splits
|
||||
for i := range splits {
|
||||
sum += splits[i]
|
||||
splits[i] = sum
|
||||
}
|
||||
|
||||
// normalize splits
|
||||
for i := range splits {
|
||||
splits[i] /= sum
|
||||
}
|
||||
|
||||
// inputs always use cpu
|
||||
input := cpuDeviceBufferType
|
||||
|
||||
assignLayer := func(layer int) deviceBufferType {
|
||||
for _, p := range params.GPULayers {
|
||||
for _, l := range p.Layers {
|
||||
if l == layer {
|
||||
for i := range requiredMemory.GPUs {
|
||||
if requiredMemory.GPUs[i].ID == p.ID {
|
||||
return gpuDeviceBufferTypes[i]
|
||||
}
|
||||
}
|
||||
|
||||
return cpuDeviceBufferType
|
||||
}
|
||||
}
|
||||
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
||||
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
||||
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
||||
assignLayer := func(i int) deviceBufferType {
|
||||
if i < gpuRangeStart || i >= gpuRangeStop {
|
||||
return cpuDeviceBufferType
|
||||
}
|
||||
|
||||
return cpuDeviceBufferType
|
||||
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
|
||||
if index < 0 || index >= len(gpuDeviceBufferTypes) {
|
||||
return cpuDeviceBufferType
|
||||
}
|
||||
|
||||
return gpuDeviceBufferTypes[index]
|
||||
}
|
||||
|
||||
// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
|
||||
@@ -275,9 +284,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
|
||||
if layer == -1 {
|
||||
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
|
||||
if params.AllocMemory {
|
||||
requiredMemory.InputWeights.Status = ml.Allocated
|
||||
}
|
||||
requiredMemory.InputWeights.Status = ml.Allocated
|
||||
requiredMemory.InputWeights.Size += uint64(size)
|
||||
} else {
|
||||
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
|
||||
@@ -348,14 +355,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
}
|
||||
|
||||
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
||||
if params.AllocMemory {
|
||||
for i := range btDeviceMemory[bt].Weights {
|
||||
if btDeviceMemory[bt].Weights[i].Size != 0 {
|
||||
if b != nil {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
|
||||
} else {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Failed
|
||||
}
|
||||
for i := range btDeviceMemory[bt].Weights {
|
||||
if btDeviceMemory[bt].Weights[i].Size != 0 {
|
||||
if b != nil {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
|
||||
} else {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Failed
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -376,9 +381,28 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
bbs[c] = b
|
||||
}
|
||||
|
||||
// Mimic llama runner logs summarizing layers and memory
|
||||
gpuLayers := 0
|
||||
for _, layer := range layers {
|
||||
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
|
||||
gpuLayers++
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
|
||||
|
||||
switch C.ggml_backend_dev_type(output.d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
slog.Info("offloading output layer to CPU")
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
slog.Info("offloading output layer to GPU")
|
||||
gpuLayers++
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
slog.Info("offloading output layer to ACCEL")
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
|
||||
|
||||
for bs := range maps.Values(bbs) {
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
|
||||
"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
|
||||
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
|
||||
}
|
||||
|
||||
// map tensor names to tensors for easy lookup later
|
||||
@@ -399,13 +423,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
b := backends[d]
|
||||
bt := C.ggml_backend_get_default_buffer_type(b)
|
||||
|
||||
// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
|
||||
if !slices.Contains(cpuDeviceBufferType.bts, bt) {
|
||||
if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
deviceBufferTypes[d] = bt
|
||||
|
||||
schedBackends = append(schedBackends, b)
|
||||
@@ -420,7 +437,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
|
||||
return &Backend{
|
||||
modelPath: modelPath,
|
||||
allocMemory: params.AllocMemory,
|
||||
flashAttention: params.FlashAttention,
|
||||
meta: meta,
|
||||
tensorLoadTargets: targets,
|
||||
@@ -436,14 +452,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
schedBackends: schedBackends,
|
||||
schedBufts: schedBufts,
|
||||
input: deviceBufferTypes[input.d],
|
||||
output: output.d,
|
||||
layers: func() map[int]layerDevice {
|
||||
m := make(map[int]layerDevice)
|
||||
layers: func() map[int]C.ggml_backend_buffer_type_t {
|
||||
m := make(map[int]C.ggml_backend_buffer_type_t)
|
||||
for i, layer := range layers {
|
||||
m[i] = layerDevice{
|
||||
d: layer.d,
|
||||
bt: deviceBufferTypes[layer.d],
|
||||
}
|
||||
m[i] = deviceBufferTypes[layer.d]
|
||||
}
|
||||
return m
|
||||
}(),
|
||||
@@ -472,30 +484,6 @@ func (b *Backend) Close() {
|
||||
}
|
||||
|
||||
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
||||
if !b.allocMemory {
|
||||
return errors.New("cannot load model without memory allocation")
|
||||
}
|
||||
|
||||
// Mimic llama runner logs summarizing layers and memory
|
||||
gpuLayers := 0
|
||||
for layer := range maps.Values(b.layers) {
|
||||
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
|
||||
gpuLayers++
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
|
||||
|
||||
switch C.ggml_backend_dev_type(b.output) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
slog.Info("offloading output layer to CPU")
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
slog.Info("offloading output layer to GPU")
|
||||
gpuLayers++
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
slog.Info("offloading output layer to ACCEL")
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
|
||||
|
||||
var doneBytes atomic.Uint64
|
||||
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
|
||||
|
||||
@@ -742,11 +730,11 @@ func (c *Context) Input() ml.Context {
|
||||
}
|
||||
|
||||
func (c *Context) Layer(i int) ml.Context {
|
||||
if layer, ok := c.b.layers[i]; ok {
|
||||
if buft, ok := c.b.layers[i]; ok {
|
||||
return &Context{
|
||||
b: c.b,
|
||||
ctx: c.ctx,
|
||||
buft: layer.bt,
|
||||
buft: buft,
|
||||
allocatedBuffers: c.allocatedBuffers,
|
||||
maxGraphNodes: c.maxGraphNodes,
|
||||
layer: i,
|
||||
@@ -804,16 +792,14 @@ func (c *Context) Reserve() {
|
||||
|
||||
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
|
||||
graph.Size += uint64(bufferStatus.size)
|
||||
if c.b.allocMemory {
|
||||
if bufferStatus.allocated && graph.Status != ml.Failed {
|
||||
graph.Status = ml.Allocated
|
||||
} else {
|
||||
graph.Status = ml.Failed
|
||||
}
|
||||
if bufferStatus.allocated && graph.Status != ml.Failed {
|
||||
graph.Status = ml.Allocated
|
||||
} else {
|
||||
graph.Status = ml.Failed
|
||||
}
|
||||
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
|
||||
"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
|
||||
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
|
||||
"size", format.HumanBytes2(uint64(bufferStatus.size)))
|
||||
}
|
||||
|
||||
if !reserved {
|
||||
@@ -882,12 +868,10 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
||||
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
|
||||
|
||||
cache.Size += uint64(size)
|
||||
if c.b.allocMemory {
|
||||
if b != nil {
|
||||
cache.Status = ml.Allocated
|
||||
} else {
|
||||
cache.Status = ml.Failed
|
||||
}
|
||||
if b != nil {
|
||||
cache.Status = ml.Allocated
|
||||
} else {
|
||||
cache.Status = ml.Failed
|
||||
}
|
||||
}
|
||||
|
||||
@@ -906,9 +890,7 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
|
||||
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
t := c.newTensor(dtype, shape)
|
||||
if c.b.allocMemory {
|
||||
C.ggml_set_zero(t.(*Tensor).t)
|
||||
}
|
||||
C.ggml_set_zero(t.(*Tensor).t)
|
||||
return t
|
||||
}
|
||||
|
||||
@@ -933,7 +915,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
||||
|
||||
t := c.newTensor(ml.DTypeF32, shape)
|
||||
|
||||
if c.b.allocMemory && len(s) > 0 {
|
||||
if len(s) > 0 {
|
||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||
}
|
||||
|
||||
@@ -945,7 +927,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
||||
|
||||
t := c.newTensor(ml.DTypeI32, shape)
|
||||
|
||||
if c.b.allocMemory && len(s) > 0 {
|
||||
if len(s) > 0 {
|
||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||
}
|
||||
|
||||
@@ -1568,7 +1550,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
|
||||
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
|
||||
// Unchecked to handle quantized types
|
||||
t := c.newTensor(dtype, shape)
|
||||
if c.b.allocMemory && len(s) > 0 {
|
||||
if len(s) > 0 {
|
||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||
}
|
||||
|
||||
|
||||
12
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
vendored
12
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
vendored
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
|
||||
ggml_backend_load_best("blas", silent, dir_path);
|
||||
ggml_backend_load_best("cann", silent, dir_path);
|
||||
ggml_backend_load_best("cuda", silent, dir_path);
|
||||
ggml_backend_load_best("hip", silent, dir_path);
|
||||
|
||||
// Avoid mixed hip+cuda configurations
|
||||
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
|
||||
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
|
||||
if (!hip_devices && !rocr_devices) {
|
||||
ggml_backend_load_best("cuda", silent, dir_path);
|
||||
} else {
|
||||
ggml_backend_load_best("hip", silent, dir_path);
|
||||
}
|
||||
|
||||
ggml_backend_load_best("metal", silent, dir_path);
|
||||
ggml_backend_load_best("rpc", silent, dir_path);
|
||||
ggml_backend_load_best("sycl", silent, dir_path);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
package arm
|
||||
|
||||
// #cgo CXXFLAGS: -std=c++17
|
||||
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include -DHWCAP2_SVE2="2"
|
||||
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include
|
||||
import "C"
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -215,12 +216,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
// modelPath is the location of the model to be loaded
|
||||
modelPath string
|
||||
|
||||
// loadMu prevents more than one load attempt from occurring at a time
|
||||
loadMu sync.Mutex
|
||||
|
||||
// is the server ready to process requests?
|
||||
// protects access to model and image
|
||||
ready sync.WaitGroup
|
||||
@@ -728,12 +723,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// loadModel allocates memory based on the given parameters and loads the weights. The
|
||||
// memory allocated is worst case for text models but not for vision.
|
||||
// multiLPath collects the values of a flag that may be given more than once.
type multiLPath []string

// Set appends value to the list. It never fails.
func (m *multiLPath) Set(value string) error {
	paths := append(*m, value)
	*m = paths

	return nil
}

// String returns the collected values joined with ", ".
func (m *multiLPath) String() string {
	paths := []string(*m)

	return strings.Join(paths, ", ")
}
|
||||
|
||||
func (s *Server) loadModel(
|
||||
params llama.ModelParams,
|
||||
mpath string,
|
||||
lpath []string,
|
||||
lpath multiLPath,
|
||||
ppath string,
|
||||
kvSize int,
|
||||
kvCacheType string,
|
||||
@@ -753,10 +757,12 @@ func (s *Server) loadModel(
|
||||
panic(err)
|
||||
}
|
||||
|
||||
for _, path := range lpath {
|
||||
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if lpath.String() != "" {
|
||||
for _, path := range lpath {
|
||||
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -777,81 +783,26 @@ func (s *Server) loadModel(
|
||||
s.ready.Done()
|
||||
}
|
||||
|
||||
// load is the handler called by the Ollama server to process different
|
||||
// load operations
|
||||
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
|
||||
s.loadMu.Lock()
|
||||
defer s.loadMu.Unlock()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
if s.status != llm.ServerStatusLaunched {
|
||||
http.Error(w, "model already loaded", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
var req llm.LoadRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "bad request", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
slog.Info("load", "request", req)
|
||||
|
||||
switch req.Operation {
|
||||
// LoadOperationFit and LoadOperationAlloc have no meaning here - just return a successful response
|
||||
|
||||
case llm.LoadOperationCommit:
|
||||
s.batchSize = req.BatchSize
|
||||
s.parallel = req.Parallel
|
||||
s.seqs = make([]*Sequence, s.parallel)
|
||||
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
||||
|
||||
gpuIDs := llama.EnumerateGPUs()
|
||||
tensorSplit := make([]float32, len(gpuIDs))
|
||||
numGPU := 0
|
||||
for i := range gpuIDs {
|
||||
for _, layers := range req.GPULayers {
|
||||
if gpuIDs[i] == layers.ID {
|
||||
tensorSplit[i] = float32(len(layers.Layers))
|
||||
numGPU += len(layers.Layers)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
params := llama.ModelParams{
|
||||
NumGpuLayers: numGPU,
|
||||
MainGpu: req.MainGPU,
|
||||
UseMmap: req.UseMmap && len(req.LoraPath) == 0,
|
||||
TensorSplit: tensorSplit,
|
||||
Progress: func(progress float32) {
|
||||
s.progress = progress
|
||||
},
|
||||
}
|
||||
|
||||
s.status = llm.ServerStatusLoadingModel
|
||||
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
|
||||
|
||||
case llm.LoadOperationClose:
|
||||
// No-op for us
|
||||
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
resp := llm.LoadResponse{Success: true}
|
||||
if err := json.NewEncoder(w).Encode(&resp); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
ppath := fs.String("mmproj", "", "Path to projector binary file")
|
||||
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
|
||||
batchSize := fs.Int("batch-size", 512, "Batch size")
|
||||
nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
|
||||
mainGpu := fs.Int("main-gpu", 0, "Main GPU")
|
||||
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
|
||||
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
|
||||
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
|
||||
port := fs.Int("port", 8080, "Port to expose the server on")
|
||||
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
|
||||
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
|
||||
noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
|
||||
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
||||
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
||||
|
||||
var lpaths multiLPath
|
||||
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(fs.Output(), "Runner usage\n")
|
||||
@@ -866,11 +817,35 @@ func Execute(args []string) error {
|
||||
llama.BackendInit()
|
||||
|
||||
server := &Server{
|
||||
modelPath: *mpath,
|
||||
status: llm.ServerStatusLaunched,
|
||||
batchSize: *batchSize,
|
||||
parallel: *parallel,
|
||||
seqs: make([]*Sequence, *parallel),
|
||||
seqsSem: semaphore.NewWeighted(int64(*parallel)),
|
||||
status: llm.ServerStatusLoadingModel,
|
||||
}
|
||||
|
||||
var tensorSplitFloats []float32
|
||||
if *tensorSplit != "" {
|
||||
splits := strings.Split(*tensorSplit, ",")
|
||||
tensorSplitFloats = make([]float32, len(splits))
|
||||
for i, s := range splits {
|
||||
f, _ := strconv.ParseFloat(s, 32)
|
||||
tensorSplitFloats[i] = float32(f)
|
||||
}
|
||||
}
|
||||
|
||||
params := llama.ModelParams{
|
||||
NumGpuLayers: *nGpuLayers,
|
||||
MainGpu: *mainGpu,
|
||||
UseMmap: !*noMmap && lpaths.String() == "",
|
||||
TensorSplit: tensorSplitFloats,
|
||||
Progress: func(progress float32) {
|
||||
server.progress = progress
|
||||
},
|
||||
}
|
||||
|
||||
server.ready.Add(1)
|
||||
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
|
||||
|
||||
server.cond = sync.NewCond(&server.mu)
|
||||
|
||||
@@ -888,7 +863,6 @@ func Execute(args []string) error {
|
||||
defer listener.Close()
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("POST /load", server.load)
|
||||
mux.HandleFunc("/embedding", server.embeddings)
|
||||
mux.HandleFunc("/completion", server.completion)
|
||||
mux.HandleFunc("/health", server.health)
|
||||
|
||||
@@ -14,7 +14,6 @@ import (
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
@@ -260,16 +259,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
// modelPath is the location of the model to be loaded
|
||||
modelPath string
|
||||
|
||||
// loadMu prevents more than one load attempt from occurring at a time
|
||||
loadMu sync.Mutex
|
||||
|
||||
// lastLoad is the load request from the previous load attempt. Used to
|
||||
// detect if we can reuse an existing memory allocation.
|
||||
lastLoad llm.LoadRequest
|
||||
|
||||
// is the server ready to process requests?
|
||||
// protects access to model and image
|
||||
ready sync.WaitGroup
|
||||
@@ -731,6 +720,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// multiLPath accumulates the values of a command-line flag that may be given
// more than once. Its Set and String methods let it be registered with a
// flag set via Var.
type multiLPath []string

// Set appends value to the collected paths. It always returns nil.
func (m *multiLPath) Set(value string) error {
	paths := append(*m, value)
	*m = paths
	return nil
}

// String renders the collected paths as one string, separated by ", ".
func (m *multiLPath) String() string {
	var joined strings.Builder
	for i, p := range *m {
		if i > 0 {
			joined.WriteString(", ")
		}
		joined.WriteString(p)
	}
	return joined.String()
}
|
||||
|
||||
func (s *Server) reserveWorstCaseGraph() error {
|
||||
ctx := s.model.Backend().NewContext()
|
||||
defer ctx.Close()
|
||||
@@ -828,28 +828,15 @@ func (s *Server) reserveWorstCaseGraph() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// allocModel pre-allocates the maximum needed memory for a model
|
||||
// based on the given parameters
|
||||
func (s *Server) allocModel(
|
||||
func (s *Server) initModel(
|
||||
mpath string,
|
||||
params ml.BackendParams,
|
||||
loraPath []string,
|
||||
lpath multiLPath,
|
||||
parallel int,
|
||||
kvCacheType string,
|
||||
kvSize int,
|
||||
multiUserCache bool,
|
||||
) (panicErr error) {
|
||||
// Convert memory allocation panics to errors
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
if err, ok := r.(error); ok {
|
||||
panicErr = err
|
||||
} else {
|
||||
panic(r)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
) error {
|
||||
var err error
|
||||
s.model, err = model.New(mpath, params)
|
||||
if err != nil {
|
||||
@@ -857,7 +844,7 @@ func (s *Server) allocModel(
|
||||
}
|
||||
|
||||
// TODO(jessegross): LoRA loading
|
||||
if len(loraPath) > 0 {
|
||||
if lpath.String() != "" {
|
||||
return errors.New("loras are not yet implemented")
|
||||
}
|
||||
|
||||
@@ -878,122 +865,63 @@ func (s *Server) allocModel(
|
||||
return s.reserveWorstCaseGraph()
|
||||
}
|
||||
|
||||
// closeModel frees all memory associated with a model
|
||||
func (s *Server) closeModel() {
|
||||
s.cache.Close()
|
||||
s.cache = nil
|
||||
if s.model != nil {
|
||||
s.model.Backend().Close()
|
||||
s.model = nil
|
||||
}
|
||||
}
|
||||
func (s *Server) load(
|
||||
ctx context.Context,
|
||||
mpath string,
|
||||
params ml.BackendParams,
|
||||
lpath multiLPath,
|
||||
parallel int,
|
||||
kvCacheType string,
|
||||
kvSize int,
|
||||
multiUserCache bool,
|
||||
) {
|
||||
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
|
||||
if err != nil {
|
||||
var noMem ml.ErrNoMem
|
||||
if errors.As(err, &noMem) {
|
||||
// We can't yet handle this but in the future we will
|
||||
s.cache.Close()
|
||||
if s.model != nil {
|
||||
s.model.Backend().Close()
|
||||
}
|
||||
}
|
||||
|
||||
// loadModel loads the weights for a model. The memory must already
|
||||
// have been allocated with allocModel
|
||||
func (s *Server) loadModel() {
|
||||
err := s.model.Backend().Load(context.TODO(),
|
||||
panic(err)
|
||||
}
|
||||
|
||||
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
|
||||
|
||||
err = s.model.Backend().Load(ctx,
|
||||
func(progress float32) {
|
||||
s.progress = progress
|
||||
})
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("failed to load model: %v", err))
|
||||
panic(err)
|
||||
}
|
||||
|
||||
s.status = llm.ServerStatusReady
|
||||
s.ready.Done()
|
||||
}
|
||||
|
||||
// load is the handler called by the Ollama server to process different
|
||||
// load operations
//
// The request body is decoded as a JSON llm.LoadRequest and the reply is a
// JSON llm.LoadResponse. The operation in the request selects the behavior:
// a close operation frees the model, a fit operation closes the model again
// after reading its memory, a commit operation starts loadModel, and any
// other operation leaves the model open.
|
||||
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
|
||||
// loadMu is held for the whole handler, so load requests run one at a time
s.loadMu.Lock()
|
||||
defer s.loadMu.Unlock()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
// Requests are only accepted while the status is still "launched"; the
// commit case below changes the status, so later requests are rejected here
if s.status != llm.ServerStatusLaunched {
|
||||
http.Error(w, "model already loaded", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
var req llm.LoadRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "bad request", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
slog.Info("load", "request", req)
|
||||
|
||||
// A close request frees the model and replies with an empty LoadResponse
if req.Operation == llm.LoadOperationClose {
|
||||
s.closeModel()
|
||||
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Copy the operation into lastLoad first so that the DeepEqual below
// effectively compares every field of the request except Operation
s.lastLoad.Operation = req.Operation
|
||||
loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
|
||||
|
||||
s.lastLoad = req
|
||||
|
||||
// Only (re)allocate when there is no model or the request has changed
if loadModel {
|
||||
s.closeModel()
|
||||
|
||||
params := ml.BackendParams{
|
||||
// AllocMemory is false only for a fit operation
AllocMemory: req.Operation != llm.LoadOperationFit,
|
||||
NumThreads: req.NumThreads,
|
||||
GPULayers: req.GPULayers,
|
||||
FlashAttention: req.FlashAttention,
|
||||
}
|
||||
|
||||
s.batchSize = req.BatchSize
|
||||
|
||||
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
|
||||
if err != nil {
|
||||
s.closeModel()
|
||||
|
||||
// An ml.ErrNoMem is reported as an unsuccessful LoadResponse carrying
// the error's BackendMemory; any other error becomes an HTTP 500
var noMem ml.ErrNoMem
|
||||
if errors.As(err, &noMem) {
|
||||
resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
|
||||
if err := json.NewEncoder(w).Encode(&resp); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Read the memory figures before the switch, because the fit case closes
// the model
mem := s.model.Backend().BackendMemory()
|
||||
|
||||
switch req.Operation {
|
||||
case llm.LoadOperationFit:
|
||||
// LoadOperationFit can't be used for anything else, so just close it
|
||||
s.closeModel()
|
||||
|
||||
// LoadOperationAlloc should stay open for future operations
|
||||
|
||||
case llm.LoadOperationCommit:
|
||||
s.status = llm.ServerStatusLoadingModel
|
||||
// loadModel runs in its own goroutine; the response below is written
// without waiting for it
go s.loadModel()
|
||||
}
|
||||
|
||||
resp := llm.LoadResponse{Success: true, Memory: mem}
|
||||
if err := json.NewEncoder(w).Encode(&resp); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
|
||||
batchSize := fs.Int("batch-size", 512, "Batch size")
|
||||
numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
|
||||
mainGPU := fs.Int("main-gpu", 0, "Main GPU")
|
||||
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
|
||||
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
|
||||
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
|
||||
port := fs.Int("port", 8080, "Port to expose the server on")
|
||||
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
|
||||
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
|
||||
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
|
||||
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
||||
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
||||
|
||||
var lpaths multiLPath
|
||||
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(fs.Output(), "Runner usage\n")
|
||||
@@ -1005,17 +933,39 @@ func Execute(args []string) error {
|
||||
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
|
||||
slog.Info("starting ollama engine")
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
server := &Server{
|
||||
modelPath: *mpath,
|
||||
status: llm.ServerStatusLaunched,
|
||||
batchSize: *batchSize,
|
||||
status: llm.ServerStatusLoadingModel,
|
||||
}
|
||||
|
||||
server.cond = sync.NewCond(&server.mu)
|
||||
server.ready.Add(1)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// TODO(jessegross): Parameters that need to be implemented:
|
||||
// no-mmap
|
||||
|
||||
var tensorSplitFloats []float32
|
||||
if *tensorSplit != "" {
|
||||
splits := strings.Split(*tensorSplit, ",")
|
||||
tensorSplitFloats = make([]float32, len(splits))
|
||||
for i, s := range splits {
|
||||
f, _ := strconv.ParseFloat(s, 32)
|
||||
tensorSplitFloats[i] = float32(f)
|
||||
}
|
||||
}
|
||||
|
||||
params := ml.BackendParams{
|
||||
NumThreads: *threads,
|
||||
NumGPULayers: *numGPULayers,
|
||||
MainGPU: *mainGPU,
|
||||
TensorSplit: tensorSplitFloats,
|
||||
FlashAttention: *flashAttention,
|
||||
}
|
||||
|
||||
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
||||
go server.run(ctx)
|
||||
|
||||
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
||||
@@ -1028,7 +978,6 @@ func Execute(args []string) error {
|
||||
|
||||
mux := http.NewServeMux()
|
||||
// TODO: support embeddings
|
||||
mux.HandleFunc("POST /load", server.load)
|
||||
mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
|
||||
})
|
||||
|
||||
@@ -2,7 +2,6 @@ package server
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"slices"
|
||||
"strings"
|
||||
@@ -276,9 +275,8 @@ const (
|
||||
// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
|
||||
// This is a higher level interface that maps harmony concepts into ollama concepts
|
||||
type HarmonyMessageHandler struct {
|
||||
state harmonyMessageState
|
||||
harmonyParser *HarmonyParser
|
||||
functionNameMap *FunctionNameMap
|
||||
state harmonyMessageState
|
||||
harmonyParser *HarmonyParser
|
||||
}
|
||||
|
||||
// NewHarmonyMessageHandler creates a new message handler
|
||||
@@ -290,7 +288,6 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
|
||||
MessageEndTag: "<|end|>",
|
||||
HeaderEndTag: "<|message|>",
|
||||
},
|
||||
functionNameMap: NewFunctionNameMap(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
|
||||
func (a *HarmonyToolCallAccumulator) Content() string {
|
||||
return a.acc.String()
|
||||
}
|
||||
|
||||
// FunctionNameMap maps a user-specified function name to a valid function
// name for harmony (which look like TypeScript identifiers). This is needed to
// transform user-specified function names, which might contain characters that
// are not allowed in TypeScript identifiers. The mapping is stored in both
// directions so a converted name can be translated back to the user's name.
type FunctionNameMap struct {
	userToHarmony map[string]string
	harmonyToUser map[string]string
}

// NewFunctionNameMap returns an empty FunctionNameMap that is ready for use.
func NewFunctionNameMap() *FunctionNameMap {
	return &FunctionNameMap{
		userToHarmony: make(map[string]string),
		harmonyToUser: make(map[string]string),
	}
}

// ConvertAndAdd derives a unique harmony-compatible name for userFunctionName,
// records the mapping in both directions, and returns the derived name.
//
// Adding the same user name more than once yields a new, suffixed harmony name
// each time; the user->harmony entry then refers to the most recent one.
func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
	harmonyFunctionName := m.deriveName(userFunctionName)
	m.userToHarmony[userFunctionName] = harmonyFunctionName
	m.harmonyToUser[harmonyFunctionName] = userFunctionName
	return harmonyFunctionName
}

// OriginalFromConverted looks up the reverse-mapping of a previously-converted
// user->harmony function name. To unmap reliably, the mapping must exist, as
// the conversion process is not reversible without the appropriate state.
// When no mapping exists a warning is logged and harmonyFunctionName is
// returned unchanged.
func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
	if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
		return userFunctionName
	}
	slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
	// fallback to the original function name if we can't find a mapping
	return harmonyFunctionName
}

// convertToValidChars converts a user-specified function name to a valid
// TypeScript identifier. Spaces, hyphens and periods become underscores,
// letters, digits, '_' and '$' are kept, and everything else is dropped. An
// empty result is replaced by "unnamed", and a result that starts with a digit
// is prefixed with an underscore.
//
// Limitations:
//
//   - This doesn't restrict reserved TypeScript keywords.
//   - We don't perform a real ID_Start/ID_Continue check, and instead use the more
//     restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
//     identifiers these models were trained on, so in the end we might want to
//     convert unicode-heavy identifiers to their closest ASCII equivalents.
func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
	mapper := func(r rune) rune {
		// first, replace certain characters with underscores
		if r == ' ' || r == '-' || r == '.' {
			return '_'
		}

		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
			return r
		}

		// finally, remove any other characters
		return -1
	}
	candidate := strings.Map(mapper, userFunctionName)

	// set a default name if we end up with nothing left
	if candidate == "" {
		return "unnamed"
	}

	// if the candidate starts with a number, prepend an underscore to make it a
	// valid identifier. The check must look at the first rune rather than the
	// first byte: the mapper keeps multi-byte decimal digits, whose leading
	// byte on its own is not a digit.
	if strings.IndexFunc(candidate, unicode.IsDigit) == 0 {
		candidate = "_" + candidate
	}

	return candidate
}

// deriveName converts userFunctionName to valid characters and then makes the
// result unique among the harmony names already stored in the map.
func (m *FunctionNameMap) deriveName(userFunctionName string) string {
	originalCandidate := m.convertToValidChars(userFunctionName)
	candidate := originalCandidate

	// Check for dupes, and if so, add a number to the end.
	// We start at 2 because if we have dupes and the first is never renamed, it
	// makes sense for them to be named, say, `f`, `f_2`, `f_3`
	count := 2
	for {
		if _, exists := m.harmonyToUser[candidate]; !exists {
			break
		}
		candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
		count++
	}

	return candidate
}
|
||||
|
||||
@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
|
||||
// handle any saving (and therefore no dupe handling)
|
||||
func TestFunctionConvertToValidChars(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
|
||||
{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
|
||||
{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
|
||||
{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
|
||||
{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
|
||||
{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
|
||||
{name: "leading number", in: "123", want: "_123"},
|
||||
{name: "$ allowed", in: "$", want: "$"},
|
||||
// show that we allow weird unicode letter characters, though we might want
|
||||
// to convert them to their closest ASCII equivalents in the future
|
||||
{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
|
||||
// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
|
||||
{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
|
||||
}
|
||||
|
||||
for i, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
parser := NewFunctionNameMap()
|
||||
got := parser.convertToValidChars(tt.in)
|
||||
if got != tt.want {
|
||||
t.Errorf("case %d: got %q, want %q", i, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFunctionConvertAndAdd(t *testing.T) {
|
||||
// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
|
||||
tests := []struct {
|
||||
name string
|
||||
in []string
|
||||
want []string
|
||||
}{
|
||||
{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
|
||||
{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
|
||||
{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
|
||||
{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
|
||||
}
|
||||
|
||||
for i, tt := range tests {
|
||||
parser := NewFunctionNameMap()
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
for j, in := range tt.in {
|
||||
got := parser.ConvertAndAdd(in)
|
||||
want := tt.want[j]
|
||||
if got != want {
|
||||
t.Errorf("case %d: got %q, want %q", i, got, want)
|
||||
}
|
||||
// check that the maps are correct
|
||||
if parser.userToHarmony[in] != want {
|
||||
t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
|
||||
}
|
||||
if parser.harmonyToUser[want] != in {
|
||||
t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -314,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
prompt = b.String()
|
||||
}
|
||||
|
||||
// If debug mode is enabled, return the rendered template instead of calling the model
|
||||
if req.DebugRenderOnly {
|
||||
c.JSON(http.StatusOK, api.DebugTemplateResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
DebugInfo: api.DebugInfo{
|
||||
RenderedTemplate: prompt,
|
||||
ImageCount: len(images),
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
var thinkingState *thinking.Parser
|
||||
if !useHarmony {
|
||||
openingTag, closingTag := thinking.InferTags(m.Template.Template)
|
||||
@@ -1490,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
|
||||
mr := api.ProcessModelResponse{
|
||||
Model: model.ShortName,
|
||||
Name: model.ShortName,
|
||||
Size: int64(v.totalSize),
|
||||
SizeVRAM: int64(v.vramSize),
|
||||
Size: int64(v.estimatedTotal),
|
||||
SizeVRAM: int64(v.estimatedVRAM),
|
||||
Digest: model.Digest,
|
||||
Details: modelDetails,
|
||||
ExpiresAt: v.expiresAt,
|
||||
}
|
||||
if v.Options != nil {
|
||||
mr.ContextLength = v.Options.NumCtx
|
||||
mr.ContextLength = v.Options.NumCtx / v.numParallel
|
||||
}
|
||||
// The scheduler waits to set expiresAt, so if a model is loading it's
|
||||
// possible that it will be set to the unix epoch. For those cases, just
|
||||
@@ -1603,12 +1590,24 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
msgs = filterThinkTags(msgs, m)
|
||||
|
||||
var harmonyMessageHandler *HarmonyMessageHandler
|
||||
var harmonyToolParser *HarmonyToolCallAccumulator
|
||||
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
|
||||
if err != nil {
|
||||
slog.Error("chat prompt error", "error", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
useHarmony := shouldUseHarmony(*m)
|
||||
|
||||
processedTools := req.Tools
|
||||
// Validate Think value: string values currently only allowed for gptoss models
|
||||
if req.Think != nil && req.Think.IsString() && !useHarmony {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
|
||||
return
|
||||
}
|
||||
|
||||
var harmonyMessageHandler *HarmonyMessageHandler
|
||||
var harmonyToolParser *HarmonyToolCallAccumulator
|
||||
|
||||
if useHarmony {
|
||||
harmonyMessageHandler = NewHarmonyMessageHandler()
|
||||
var lastMessage *api.Message
|
||||
@@ -1617,40 +1616,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
|
||||
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
|
||||
|
||||
// make a copy of tools to pass to the chat prompt. Function names may be
|
||||
// renamed to be valid Harmony function names.
|
||||
processedTools = make([]api.Tool, len(req.Tools))
|
||||
copy(processedTools, req.Tools)
|
||||
for i, tool := range processedTools {
|
||||
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
|
||||
}
|
||||
}
|
||||
|
||||
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
|
||||
if err != nil {
|
||||
slog.Error("chat prompt error", "error", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// If debug mode is enabled, return the rendered template instead of calling the model
|
||||
if req.DebugRenderOnly {
|
||||
c.JSON(http.StatusOK, api.DebugTemplateResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
DebugInfo: api.DebugInfo{
|
||||
RenderedTemplate: prompt,
|
||||
ImageCount: len(images),
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Validate Think value: string values currently only allowed for gptoss models
|
||||
if req.Think != nil && req.Think.IsString() && !useHarmony {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
|
||||
return
|
||||
}
|
||||
|
||||
var thinkingState *thinking.Parser
|
||||
@@ -1705,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
toolName, toolContent := harmonyToolParser.Drain()
|
||||
if toolName != nil {
|
||||
*toolName = strings.TrimPrefix(*toolName, "functions.")
|
||||
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
|
||||
var args api.ToolCallFunctionArguments
|
||||
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
|
||||
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
|
||||
|
||||
@@ -1,413 +0,0 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
func TestGenerateDebugRenderOnly(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
|
||||
mock := mockRunner{
|
||||
CompletionResponse: llm.CompletionResponse{
|
||||
Done: true,
|
||||
DoneReason: llm.DoneReasonStop,
|
||||
PromptEvalCount: 1,
|
||||
PromptEvalDuration: 1,
|
||||
EvalCount: 1,
|
||||
EvalDuration: 1,
|
||||
},
|
||||
}
|
||||
|
||||
s := Server{
|
||||
sched: &Scheduler{
|
||||
pendingReqCh: make(chan *LlmRequest, 1),
|
||||
finishedReqCh: make(chan *LlmRequest, 1),
|
||||
expiredCh: make(chan *runnerRef, 1),
|
||||
unloadedCh: make(chan any, 1),
|
||||
loaded: make(map[string]*runnerRef),
|
||||
newServerFn: newMockServer(&mock),
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
go s.sched.Run(t.Context())
|
||||
|
||||
// Create a test model
|
||||
stream := false
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.context_length": uint32(8192),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(8),
|
||||
"tokenizer.ggml.tokens": []string{""},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []*ggml.Tensor{
|
||||
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
})
|
||||
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Model: "test-model",
|
||||
Files: map[string]string{"file.gguf": digest},
|
||||
Template: "{{ .Prompt }}",
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", w.Code)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
request api.GenerateRequest
|
||||
expectDebug bool
|
||||
expectTemplate string
|
||||
expectNumImages int
|
||||
}{
|
||||
{
|
||||
name: "debug render only enabled",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "Hello, world!",
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "Hello, world!",
|
||||
},
|
||||
{
|
||||
name: "debug render only disabled",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "Hello, world!",
|
||||
DebugRenderOnly: false,
|
||||
},
|
||||
expectDebug: false,
|
||||
},
|
||||
{
|
||||
name: "debug render only with system prompt",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "User question",
|
||||
System: "You are a helpful assistant",
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "User question",
|
||||
},
|
||||
{
|
||||
name: "debug render only with template",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "Hello",
|
||||
Template: "PROMPT: {{ .Prompt }}",
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "PROMPT: Hello",
|
||||
},
|
||||
{
|
||||
name: "debug render only with images",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "Describe this image",
|
||||
Images: []api.ImageData{[]byte("fake-image-data")},
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "[img-0]\n\nDescribe this image",
|
||||
expectNumImages: 1,
|
||||
},
|
||||
{
|
||||
name: "debug render only with raw mode",
|
||||
request: api.GenerateRequest{
|
||||
Model: "test-model",
|
||||
Prompt: "Raw prompt text",
|
||||
Raw: true,
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "Raw prompt text",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
// Test both with and without streaming
|
||||
streamValues := []bool{false, true}
|
||||
for _, stream := range streamValues {
|
||||
streamSuffix := ""
|
||||
if stream {
|
||||
streamSuffix = " (streaming)"
|
||||
}
|
||||
t.Run(tt.name+streamSuffix, func(t *testing.T) {
|
||||
req := tt.request
|
||||
req.Stream = &stream
|
||||
w := createRequest(t, s.GenerateHandler, req)
|
||||
|
||||
if tt.expectDebug {
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var response api.DebugTemplateResponse
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
|
||||
t.Fatalf("failed to unmarshal response: %v", err)
|
||||
}
|
||||
|
||||
if response.Model != tt.request.Model {
|
||||
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
|
||||
}
|
||||
|
||||
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
|
||||
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
|
||||
}
|
||||
|
||||
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
|
||||
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
|
||||
}
|
||||
} else {
|
||||
// When debug is disabled, it should attempt normal processing
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDebugRenderOnly(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
|
||||
mock := mockRunner{
|
||||
CompletionResponse: llm.CompletionResponse{
|
||||
Done: true,
|
||||
DoneReason: llm.DoneReasonStop,
|
||||
PromptEvalCount: 1,
|
||||
PromptEvalDuration: 1,
|
||||
EvalCount: 1,
|
||||
EvalDuration: 1,
|
||||
},
|
||||
}
|
||||
|
||||
s := Server{
|
||||
sched: &Scheduler{
|
||||
pendingReqCh: make(chan *LlmRequest, 1),
|
||||
finishedReqCh: make(chan *LlmRequest, 1),
|
||||
expiredCh: make(chan *runnerRef, 1),
|
||||
unloadedCh: make(chan any, 1),
|
||||
loaded: make(map[string]*runnerRef),
|
||||
newServerFn: newMockServer(&mock),
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
go s.sched.Run(t.Context())
|
||||
|
||||
// Create a test model
|
||||
stream := false
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.context_length": uint32(8192),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(8),
|
||||
"tokenizer.ggml.tokens": []string{""},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []*ggml.Tensor{
|
||||
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
})
|
||||
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Model: "test-model",
|
||||
Files: map[string]string{"file.gguf": digest},
|
||||
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", w.Code)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
request api.ChatRequest
|
||||
expectDebug bool
|
||||
expectTemplate string
|
||||
expectNumImages int
|
||||
}{
|
||||
{
|
||||
name: "chat debug render only enabled",
|
||||
request: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{Role: "system", Content: "You are a helpful assistant"},
|
||||
{Role: "user", Content: "Hello"},
|
||||
},
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
|
||||
},
|
||||
{
|
||||
name: "chat debug render only disabled",
|
||||
request: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Hello"},
|
||||
},
|
||||
DebugRenderOnly: false,
|
||||
},
|
||||
expectDebug: false,
|
||||
},
|
||||
{
|
||||
name: "chat debug with assistant message",
|
||||
request: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Hello"},
|
||||
{Role: "assistant", Content: "Hi there!"},
|
||||
{Role: "user", Content: "How are you?"},
|
||||
},
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
|
||||
},
|
||||
{
|
||||
name: "chat debug with images",
|
||||
request: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's in this image?",
|
||||
Images: []api.ImageData{[]byte("fake-image-data")},
|
||||
},
|
||||
},
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "user: [img-0]What's in this image?\n",
|
||||
expectNumImages: 1,
|
||||
},
|
||||
{
|
||||
name: "chat debug with tools",
|
||||
request: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Get the weather"},
|
||||
},
|
||||
Tools: api.Tools{
|
||||
{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "get_weather",
|
||||
Description: "Get weather information",
|
||||
},
|
||||
},
|
||||
},
|
||||
DebugRenderOnly: true,
|
||||
},
|
||||
expectDebug: true,
|
||||
expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
// Test both with and without streaming
|
||||
streamValues := []bool{false, true}
|
||||
for _, stream := range streamValues {
|
||||
streamSuffix := ""
|
||||
if stream {
|
||||
streamSuffix = " (streaming)"
|
||||
}
|
||||
t.Run(tt.name+streamSuffix, func(t *testing.T) {
|
||||
req := tt.request
|
||||
req.Stream = &stream
|
||||
w := createRequest(t, s.ChatHandler, req)
|
||||
|
||||
if tt.expectDebug {
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var response api.DebugTemplateResponse
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
|
||||
t.Fatalf("failed to unmarshal response: %v", err)
|
||||
}
|
||||
|
||||
if response.Model != tt.request.Model {
|
||||
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
|
||||
}
|
||||
|
||||
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
|
||||
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
|
||||
}
|
||||
|
||||
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
|
||||
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
|
||||
}
|
||||
} else {
|
||||
// When debug is disabled, it should attempt normal processing
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -77,13 +77,12 @@ func TestGenerateChat(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -621,13 +620,12 @@ func TestGenerate(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -277,11 +277,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 100 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -428,11 +427,10 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 100 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -610,11 +608,10 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
365
server/sched.go
365
server/sched.go
@@ -28,6 +28,7 @@ type LlmRequest struct {
|
||||
ctx context.Context //nolint:containedctx
|
||||
model *Model
|
||||
opts api.Options
|
||||
origNumCtx int // Track the initial ctx request
|
||||
sessionDuration *api.Duration
|
||||
successCh chan *runnerRef
|
||||
errCh chan error
|
||||
@@ -40,17 +41,10 @@ type Scheduler struct {
|
||||
expiredCh chan *runnerRef
|
||||
unloadedCh chan any
|
||||
|
||||
// loadedMu protects loaded and activeLoading
|
||||
loaded map[string]*runnerRef
|
||||
loadedMu sync.Mutex
|
||||
|
||||
// activeLoading is the model that we are currently working on loading,
|
||||
// including by evicting one or more other models. We can only load
|
||||
// one model at a time but new requests to models that already loaded can
|
||||
// happen in parallel
|
||||
activeLoading llm.LlamaServer
|
||||
loaded map[string]*runnerRef
|
||||
|
||||
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
|
||||
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
|
||||
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
|
||||
getGpuFn func() discover.GpuInfoList
|
||||
getCpuFn func() discover.GpuInfoList
|
||||
@@ -62,6 +56,9 @@ type Scheduler struct {
|
||||
// on a large GPU can cause stalling
|
||||
var defaultModelsPerGPU = 3
|
||||
|
||||
// Default automatic value for parallel setting
|
||||
var defaultParallel = 1
|
||||
|
||||
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
|
||||
|
||||
func InitScheduler(ctx context.Context) *Scheduler {
|
||||
@@ -82,36 +79,24 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
||||
}
|
||||
|
||||
// context must be canceled to decrement ref count and release the runner
|
||||
func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
if m.CheckCapabilities(model.CapabilityVision) == nil {
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
req := &LlmRequest{
|
||||
ctx: c,
|
||||
model: m,
|
||||
model: model,
|
||||
opts: opts,
|
||||
sessionDuration: sessionDuration,
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
successCh: make(chan *runnerRef),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
|
||||
s.loadedMu.Lock()
|
||||
runner := s.loaded[req.model.ModelPath]
|
||||
s.loadedMu.Unlock()
|
||||
if runner != nil && !runner.needsReload(c, req) {
|
||||
req.useLoadedRunner(runner, s.finishedReqCh)
|
||||
} else {
|
||||
select {
|
||||
case s.pendingReqCh <- req:
|
||||
default:
|
||||
req.errCh <- ErrMaxQueue
|
||||
}
|
||||
select {
|
||||
case s.pendingReqCh <- req:
|
||||
default:
|
||||
req.errCh <- ErrMaxQueue
|
||||
}
|
||||
return req.successCh, req.errCh
|
||||
}
|
||||
@@ -137,11 +122,21 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
case pending := <-s.pendingReqCh:
|
||||
// Block other requests until we get this pending request running
|
||||
pending.schedAttempts++
|
||||
if pending.origNumCtx == 0 {
|
||||
pending.origNumCtx = pending.opts.NumCtx
|
||||
}
|
||||
|
||||
if pending.ctx.Err() != nil {
|
||||
slog.Debug("pending request cancelled or timed out, skipping scheduling")
|
||||
continue
|
||||
}
|
||||
numParallel := int(envconfig.NumParallel())
|
||||
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
|
||||
// ref: https://github.com/ollama/ollama/issues/4165
|
||||
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
|
||||
numParallel = 1
|
||||
slog.Warn("mllama does not currently support parallel requests")
|
||||
}
|
||||
|
||||
for {
|
||||
var runnerToExpire *runnerRef
|
||||
@@ -200,26 +195,84 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
break
|
||||
}
|
||||
|
||||
// Update free memory from currently loaded models
|
||||
s.updateFreeSpace(gpus)
|
||||
// Embedding models should always be loaded with parallel=1
|
||||
if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil {
|
||||
numParallel = 1
|
||||
}
|
||||
|
||||
if loadedCount == 0 {
|
||||
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
// simplifying assumption of defaultParallel when in CPU mode
|
||||
if numParallel <= 0 {
|
||||
numParallel = defaultParallel
|
||||
}
|
||||
|
||||
pending.opts.NumCtx = pending.origNumCtx * numParallel
|
||||
|
||||
if loadedCount == 0 {
|
||||
slog.Debug("cpu mode with first model, loading")
|
||||
s.loadFn(pending, ggml, gpus, numParallel)
|
||||
break
|
||||
}
|
||||
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
|
||||
if runnerToExpire == nil {
|
||||
slog.Debug("cpu mode with available system memory or first model, loading")
|
||||
s.loadFn(pending, ggml, gpus, numParallel)
|
||||
break
|
||||
}
|
||||
// else we need to expire a runner
|
||||
} else if loadedCount == 0 {
|
||||
// No models loaded. Load the model but prefer the best fit.
|
||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||
s.loadFn(pending, ggml, gpus, false)
|
||||
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
|
||||
if g != nil {
|
||||
gpus = g
|
||||
} else {
|
||||
// Only allow partial loads when this is the first model
|
||||
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
|
||||
}
|
||||
s.loadFn(pending, ggml, gpus, numParallel)
|
||||
break
|
||||
}
|
||||
|
||||
// More than one loaded model, so we have to see if the
|
||||
// new one fits
|
||||
if runnerToExpire == nil {
|
||||
// More than one loaded model, so we have to see if the
|
||||
// new one fits
|
||||
//
|
||||
// We want to avoid loading on any GPUs that have other
|
||||
// models still loading on them to avoid potential races
|
||||
// with VRAM consumption ramping up during load
|
||||
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
|
||||
|
||||
needEvict := s.loadFn(pending, ggml, gpus, true)
|
||||
if !needEvict {
|
||||
slog.Debug("new model fits with existing models, loading")
|
||||
break
|
||||
// Update free memory from currently loaded models
|
||||
s.updateFreeSpace(availGpus)
|
||||
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
|
||||
if fitGpus != nil {
|
||||
slog.Debug("new model fits with existing models, loading")
|
||||
s.loadFn(pending, ggml, fitGpus, numParallel)
|
||||
break
|
||||
}
|
||||
|
||||
// We couldn't find a set of GPUs to fully load the new
|
||||
// model. If no other models are loading (both GPU lists
|
||||
// are the same) then we need to unload another model to
|
||||
// make room
|
||||
if len(availGpus) < len(gpus) {
|
||||
// There are other requests pending, and this one
|
||||
// needs more time, so put it on the back of the
|
||||
// queue so that we might satisfy other pending
|
||||
// requests that aren't blocked
|
||||
go func() {
|
||||
// Process in a go routine to avoid deadlocking
|
||||
// the scheduler if our queue is full
|
||||
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
|
||||
time.Sleep(s.reschedDelay)
|
||||
s.pendingReqCh <- pending
|
||||
}()
|
||||
break
|
||||
}
|
||||
runnerToExpire = s.findRunnerToUnload()
|
||||
}
|
||||
|
||||
runnerToExpire = s.findRunnerToUnload()
|
||||
}
|
||||
|
||||
if runnerToExpire == nil {
|
||||
@@ -240,6 +293,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
}
|
||||
runnerToExpire.refMu.Unlock()
|
||||
// Wait for the unload to happen
|
||||
// Note: at this point we're queueing up all incoming requests, even if they were for
|
||||
// a different model that's loaded and not scheduled to be removed.
|
||||
slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -379,72 +434,26 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
|
||||
}()
|
||||
}
|
||||
|
||||
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
|
||||
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
|
||||
numParallel := int(envconfig.NumParallel())
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
|
||||
if numParallel < 1 {
|
||||
numParallel = 1
|
||||
}
|
||||
|
||||
// Embedding models should always be loaded with parallel=1
|
||||
if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
|
||||
numParallel = 1
|
||||
}
|
||||
|
||||
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
|
||||
// ref: https://github.com/ollama/ollama/issues/4165
|
||||
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
|
||||
numParallel = 1
|
||||
slog.Warn("mllama does not currently support parallel requests")
|
||||
}
|
||||
|
||||
sessionDuration := envconfig.KeepAlive()
|
||||
if req.sessionDuration != nil {
|
||||
sessionDuration = req.sessionDuration.Duration
|
||||
}
|
||||
|
||||
s.loadedMu.Lock()
|
||||
llama := s.activeLoading
|
||||
|
||||
if llama == nil {
|
||||
var err error
|
||||
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
|
||||
}
|
||||
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
|
||||
req.errCh <- err
|
||||
s.loadedMu.Unlock()
|
||||
return false
|
||||
}
|
||||
|
||||
s.activeLoading = llama
|
||||
} else {
|
||||
if s.activeLoading.ModelPath() != req.model.ModelPath {
|
||||
panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
|
||||
}
|
||||
}
|
||||
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
err := llama.Load(req.ctx, gpus, requireFull)
|
||||
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
if err != nil {
|
||||
if errors.Is(err, llm.ErrLoadRequiredFull) {
|
||||
return true
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
|
||||
}
|
||||
|
||||
slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
|
||||
s.activeLoading.Close()
|
||||
s.activeLoading = nil
|
||||
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
|
||||
req.errCh <- err
|
||||
return false
|
||||
return
|
||||
}
|
||||
|
||||
runner := &runnerRef{
|
||||
model: req.model,
|
||||
modelPath: req.model.ModelPath,
|
||||
@@ -452,8 +461,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
|
||||
Options: &req.opts,
|
||||
sessionDuration: sessionDuration,
|
||||
gpus: gpus,
|
||||
vramSize: llama.VRAMSize(),
|
||||
totalSize: llama.TotalSize(),
|
||||
estimatedVRAM: llama.EstimatedVRAM(),
|
||||
estimatedTotal: llama.EstimatedTotal(),
|
||||
loading: true,
|
||||
pid: llama.Pid(),
|
||||
}
|
||||
@@ -468,7 +477,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
|
||||
oldRunner.unload()
|
||||
oldRunner.refMu.Unlock()
|
||||
}
|
||||
s.activeLoading = nil
|
||||
s.loaded[req.model.ModelPath] = runner
|
||||
slog.Info("loaded runners", "count", len(s.loaded))
|
||||
s.loadedMu.Unlock()
|
||||
@@ -495,8 +503,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
|
||||
}()
|
||||
req.successCh <- runner
|
||||
}()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
|
||||
@@ -515,7 +521,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
|
||||
r.refMu.Lock()
|
||||
if r.llama != nil {
|
||||
for _, gpu := range allGpus {
|
||||
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.VRAMByGPU(gpu.ID)
|
||||
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
|
||||
}
|
||||
} else {
|
||||
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
|
||||
@@ -542,17 +548,41 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
|
||||
}
|
||||
}
|
||||
|
||||
// While models are loading the VRAM consumption numbers will be indeterminate, so we have
|
||||
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
|
||||
// This routine returns the set of GPUs that do not have an active loading model.
|
||||
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
|
||||
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
|
||||
ret := append(discover.GpuInfoList{}, allGpus...)
|
||||
s.loadedMu.Lock()
|
||||
defer s.loadedMu.Unlock()
|
||||
for _, runner := range s.loaded {
|
||||
if runner.loading {
|
||||
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
|
||||
for _, busyGPU := range runner.gpus {
|
||||
for i := range ret {
|
||||
if ret[i].ID == busyGPU.ID {
|
||||
ret = append(ret[:i], ret[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
// TODO consolidate sched_types.go
|
||||
type runnerRef struct {
|
||||
refMu sync.Mutex
|
||||
refCount uint // prevent unloading if > 0
|
||||
|
||||
llama llm.LlamaServer
|
||||
pid int
|
||||
loading bool // True only during initial load, then false forever
|
||||
gpus discover.GpuInfoList // Recorded at time of provisioning
|
||||
vramSize uint64
|
||||
totalSize uint64
|
||||
llama llm.LlamaServer
|
||||
pid int
|
||||
loading bool // True only during initial load, then false forever
|
||||
gpus discover.GpuInfoList // Recorded at time of provisioning
|
||||
estimatedVRAM uint64
|
||||
estimatedTotal uint64
|
||||
|
||||
sessionDuration time.Duration
|
||||
expireTimer *time.Timer
|
||||
@@ -601,6 +631,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
|
||||
optsNew.NumGPU = -1
|
||||
}
|
||||
|
||||
// Normalize the NumCtx for parallelism
|
||||
optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
|
||||
@@ -661,7 +694,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
|
||||
freeMemoryNow += gpu.FreeMemory
|
||||
}
|
||||
// If we're within ~80% of the estimated memory usage recovered, bail out
|
||||
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.8 {
|
||||
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
|
||||
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
|
||||
finished <- struct{}{}
|
||||
return
|
||||
@@ -686,8 +719,8 @@ func (runner *runnerRef) LogValue() slog.Value {
|
||||
)
|
||||
}
|
||||
attrs = append(attrs,
|
||||
slog.String("size", format.HumanBytes2(runner.totalSize)),
|
||||
slog.String("vram", format.HumanBytes2(runner.vramSize)),
|
||||
slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
|
||||
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
|
||||
slog.Int("parallel", runner.numParallel),
|
||||
slog.Int("pid", runner.pid),
|
||||
slog.String("model", runner.modelPath),
|
||||
@@ -717,7 +750,95 @@ func (a ByDurationAndName) Less(i, j int) bool {
|
||||
// type BySize []*runnerRef
|
||||
// func (a BySize) Len() int { return len(a) }
|
||||
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
// func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
|
||||
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
|
||||
|
||||
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||
// The list of GPUs returned will always be the same brand (library)
|
||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
|
||||
// opts.NumCtx accordingly
|
||||
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
var numParallelToTry []int
|
||||
if *numParallel <= 0 {
|
||||
// If no specific parallel setting was provided, try larger then smaller, always end with 1
|
||||
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
|
||||
} else {
|
||||
numParallelToTry = []int{*numParallel}
|
||||
}
|
||||
|
||||
for _, gl := range gpus.ByLibrary() {
|
||||
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
|
||||
|
||||
// TODO - potentially sort by performance capability, existing models loaded, etc.
|
||||
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
|
||||
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
|
||||
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
|
||||
|
||||
if !envconfig.SchedSpread() {
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
// Try to pack into as few GPUs as possible, starting from 1 GPU
|
||||
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
|
||||
gpuSubset := sgl[:numGPUs]
|
||||
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
|
||||
|
||||
if ok {
|
||||
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
|
||||
"model", req.model.ModelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", p,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", numGPUs)
|
||||
*numParallel = p
|
||||
return gpuSubset
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO future refinements
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading",
|
||||
"model", req.model.ModelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", p,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", len(sgl))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
||||
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
if *numParallel <= 0 {
|
||||
*numParallel = 1
|
||||
req.opts.NumCtx = req.origNumCtx
|
||||
}
|
||||
byLibrary := gpus.ByLibrary()
|
||||
if len(byLibrary) <= 1 {
|
||||
return gpus
|
||||
}
|
||||
var bestEstimate uint64
|
||||
var bestFit int
|
||||
for i, gl := range byLibrary {
|
||||
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
|
||||
if estimatedVRAM > bestEstimate {
|
||||
bestEstimate = estimatedVRAM
|
||||
bestFit = i
|
||||
}
|
||||
}
|
||||
return byLibrary[bestFit]
|
||||
}
|
||||
|
||||
// findRunnerToUnload finds a runner to unload to make room for a new model
|
||||
func (s *Scheduler) findRunnerToUnload() *runnerRef {
|
||||
@@ -754,13 +875,6 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
|
||||
func (s *Scheduler) unloadAllRunners() {
|
||||
s.loadedMu.Lock()
|
||||
defer s.loadedMu.Unlock()
|
||||
|
||||
if s.activeLoading != nil {
|
||||
slog.Debug("shutting down currently loading runner")
|
||||
s.activeLoading.Close()
|
||||
s.activeLoading = nil
|
||||
}
|
||||
|
||||
for model, runner := range s.loaded {
|
||||
if runner.llama != nil {
|
||||
slog.Debug("shutting down runner", "model", model)
|
||||
@@ -787,3 +901,18 @@ func (s *Scheduler) expireRunner(model *Model) {
|
||||
runner.refMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// If other runners are loaded, make sure the pending request will fit in system memory
|
||||
// If not, pick a runner to unload, else return nil and the request can be loaded
|
||||
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
|
||||
slog.Debug("evaluating if CPU model load will fit in available system memory")
|
||||
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
|
||||
if estimate.TotalSize <= gpus[0].FreeMemory {
|
||||
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
|
||||
|
||||
return s.findRunnerToUnload()
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
|
||||
return nil, errors.New("something failed to load model blah")
|
||||
}
|
||||
gpus := discover.GpuInfoList{}
|
||||
s.load(req, f, gpus, false)
|
||||
s.load(req, f, gpus, 0)
|
||||
require.Empty(t, req.successCh)
|
||||
require.Len(t, req.errCh, 1)
|
||||
s.loadedMu.Lock()
|
||||
@@ -61,17 +61,16 @@ func TestLoad(t *testing.T) {
|
||||
err := <-req.errCh
|
||||
require.Contains(t, err.Error(), "this model may be incompatible")
|
||||
|
||||
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
|
||||
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
server.modelPath = model
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, f, gpus, false)
|
||||
s.load(req, f, gpus, 0)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.NoError(t, err)
|
||||
case resp := <-req.successCh:
|
||||
require.Equal(t, uint64(10), resp.vramSize)
|
||||
require.Equal(t, uint64(10), resp.estimatedVRAM)
|
||||
require.Equal(t, uint(1), resp.refCount)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 1)
|
||||
@@ -80,7 +79,7 @@ func TestLoad(t *testing.T) {
|
||||
|
||||
req.model.ModelPath = "dummy_model_path"
|
||||
server.waitResp = errors.New("wait failure")
|
||||
s.load(req, f, gpus, false)
|
||||
s.load(req, f, gpus, 0)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.Contains(t, err.Error(), "wait failure")
|
||||
@@ -105,11 +104,10 @@ type reqBundle struct {
|
||||
}
|
||||
|
||||
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
scenario.srv.modelPath = model
|
||||
return scenario.srv, nil
|
||||
}
|
||||
|
||||
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
|
||||
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
|
||||
b := &reqBundle{}
|
||||
b.ctx, b.ctxDone = context.WithCancel(ctx)
|
||||
t.Helper()
|
||||
@@ -146,7 +144,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
|
||||
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -264,10 +262,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
|
||||
|
||||
// Multiple loaded models
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
|
||||
b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
|
||||
c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
|
||||
c.req.opts.NumGPU = 0 // CPU load, will be allowed
|
||||
d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
|
||||
b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
|
||||
c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
|
||||
c.req.opts.NumGPU = 0 // CPU load, will be allowed
|
||||
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
|
||||
|
||||
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
|
||||
s.newServerFn = a.newServer
|
||||
@@ -420,12 +418,11 @@ func TestExpireRunner(t *testing.T) {
|
||||
|
||||
var f *ggml.GGML
|
||||
gpus := discover.GpuInfoList{}
|
||||
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
|
||||
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
server.modelPath = model
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, f, gpus, false)
|
||||
s.load(req, f, gpus, 0)
|
||||
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
@@ -509,7 +506,7 @@ func TestUseLoadedRunner(t *testing.T) {
|
||||
sessionDuration: &api.Duration{Duration: 2},
|
||||
}
|
||||
finished := make(chan *LlmRequest)
|
||||
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
|
||||
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
||||
r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
|
||||
req.useLoadedRunner(r1, finished)
|
||||
require.Equal(t, uint(1), r1.refCount)
|
||||
@@ -544,8 +541,8 @@ func TestUpdateFreeSpace(t *testing.T) {
|
||||
gpus[0].FreeMemory = 900
|
||||
gpus[1].TotalMemory = 2000
|
||||
gpus[1].FreeMemory = 1900
|
||||
llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
|
||||
llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
|
||||
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
|
||||
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
|
||||
r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
|
||||
r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
|
||||
|
||||
@@ -560,6 +557,40 @@ func TestUpdateFreeSpace(t *testing.T) {
|
||||
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
|
||||
}
|
||||
|
||||
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
|
||||
defer done()
|
||||
gpus := discover.GpuInfoList{
|
||||
{
|
||||
Library: "cuda",
|
||||
ID: "0",
|
||||
},
|
||||
{
|
||||
Library: "cuda",
|
||||
ID: "1",
|
||||
},
|
||||
}
|
||||
r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
|
||||
|
||||
s := InitScheduler(ctx)
|
||||
s.loadedMu.Lock()
|
||||
s.loaded["a"] = r1
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
tmp := s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 1)
|
||||
require.Equal(t, "1", tmp[0].ID)
|
||||
|
||||
r1.gpus = discover.GpuInfoList{gpus[1]}
|
||||
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 1)
|
||||
require.Equal(t, "0", tmp[0].ID)
|
||||
|
||||
r1.gpus = discover.GpuInfoList{}
|
||||
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 2)
|
||||
}
|
||||
|
||||
func TestFindRunnerToUnload(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
|
||||
defer done()
|
||||
@@ -584,7 +615,7 @@ func TestNeedsReload(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
llm := &mockLlm{vramByGPU: map[string]uint64{}}
|
||||
llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
||||
do := api.DefaultOptions()
|
||||
runner := &runnerRef{
|
||||
model: &Model{
|
||||
@@ -631,8 +662,8 @@ func TestUnloadAllRunners(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
|
||||
llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
|
||||
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
||||
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s := InitScheduler(ctx)
|
||||
s.unloadAllRunners()
|
||||
|
||||
@@ -650,7 +681,7 @@ func TestUnloadAllRunners(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestUnload(t *testing.T) {
|
||||
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
|
||||
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
||||
r1 := &runnerRef{llama: llm1, numParallel: 1}
|
||||
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
|
||||
r1.unload()
|
||||
@@ -676,40 +707,62 @@ func TestAlreadyCanceled(t *testing.T) {
|
||||
require.Empty(t, scenario1a.req.successCh)
|
||||
}
|
||||
|
||||
type mockLlm struct {
|
||||
modelPath string
|
||||
pingResp error
|
||||
waitResp error
|
||||
completionResp error
|
||||
embeddingResp []float32
|
||||
embeddingRespErr error
|
||||
tokenizeResp []int
|
||||
tokenizeRespErr error
|
||||
detokenizeResp string
|
||||
detonekizeRespErr error
|
||||
closeResp error
|
||||
closeCalled bool
|
||||
vramSize uint64
|
||||
totalSize uint64
|
||||
vramByGPU map[string]uint64
|
||||
}
|
||||
func TestHomogeneousGPUs(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
|
||||
func (s *mockLlm) ModelPath() string {
|
||||
return s.modelPath
|
||||
}
|
||||
|
||||
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
|
||||
if requireFull {
|
||||
for _, g := range gpus {
|
||||
if g.FreeMemory >= s.vramSize {
|
||||
return nil
|
||||
}
|
||||
s.getGpuFn = func() discover.GpuInfoList {
|
||||
// Set memory values to require the model to be spread
|
||||
gpus := []discover.GpuInfo{
|
||||
{Library: "cuda"},
|
||||
{Library: "rocm"},
|
||||
}
|
||||
|
||||
return llm.ErrLoadRequiredFull
|
||||
gpus[0].TotalMemory = 1 * format.GibiByte
|
||||
gpus[0].FreeMemory = 256 * format.MebiByte
|
||||
gpus[1].TotalMemory = 1 * format.GibiByte
|
||||
gpus[1].FreeMemory = 256 * format.MebiByte
|
||||
return gpus
|
||||
}
|
||||
s.getCpuFn = getCpuFn
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
require.Len(t, gpus, 1)
|
||||
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
|
||||
}
|
||||
slog.Info("a")
|
||||
s.pendingReqCh <- a.req
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-a.req.successCh:
|
||||
require.Equal(t, resp.llama, a.srv)
|
||||
require.Empty(t, s.pendingReqCh)
|
||||
require.Empty(t, a.req.errCh)
|
||||
case err := <-a.req.errCh:
|
||||
t.Fatal(err.Error())
|
||||
case <-ctx.Done():
|
||||
t.Fatal("timeout")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type mockLlm struct {
|
||||
pingResp error
|
||||
waitResp error
|
||||
completionResp error
|
||||
embeddingResp []float32
|
||||
embeddingRespErr error
|
||||
tokenizeResp []int
|
||||
tokenizeRespErr error
|
||||
detokenizeResp string
|
||||
detonekizeRespErr error
|
||||
closeResp error
|
||||
closeCalled bool
|
||||
estimatedVRAM uint64
|
||||
estimatedTotal uint64
|
||||
estimatedVRAMByGPU map[string]uint64
|
||||
}
|
||||
|
||||
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
|
||||
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
|
||||
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
||||
@@ -732,7 +785,7 @@ func (s *mockLlm) Close() error {
|
||||
s.closeCalled = true
|
||||
return s.closeResp
|
||||
}
|
||||
func (s *mockLlm) VRAMSize() uint64 { return s.vramSize }
|
||||
func (s *mockLlm) TotalSize() uint64 { return s.totalSize }
|
||||
func (s *mockLlm) VRAMByGPU(gpuid string) uint64 { return s.vramByGPU[gpuid] }
|
||||
func (s *mockLlm) Pid() int { return -1 }
|
||||
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
||||
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
|
||||
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|
||||
func (s *mockLlm) Pid() int { return -1 }
|
||||
|
||||
Reference in New Issue
Block a user