Compare commits

..

10 Commits

Author SHA1 Message Date
Blake Mizerany
acbffa59e9 llm: suppress large allocations for GGUF arrays
This introduces a little array type for holding GGUF arrays that
prevents the array from growing too large. It preserves the total size
of the array, but limits the number of elements that are actually
allocated.

GGUF arrays that are extremely large, such as the token list, are
generally uninteresting to users and are not worth the memory overhead
or the time spent allocating and freeing them. They are necessary for
inference, but not for inspection.

The size of these arrays is, however, important in Ollama, so it is
preserved in a separate field on array.
2024-06-23 14:26:56 -07:00
Daniel Hiltgen
ccef9431c8 Merge pull request #5205 from dhiltgen/modelfile_use_mmap
Fix use_mmap parsing for modelfiles
2024-06-21 16:30:36 -07:00
royjhan
9a9e7d83c4 Docs (#5149) 2024-06-21 15:52:09 -07:00
Michael Yang
189a43caa2 Merge pull request #5206 from ollama/mxyng/quantize
fix: quantization with template
2024-06-21 13:44:34 -07:00
Michael Yang
e835ef1836 fix: quantization with template 2024-06-21 13:39:25 -07:00
Daniel Hiltgen
7e7749224c Fix use_mmap parsing for modelfiles
Add the new tristate parsing logic for the code path for modelfiles,
as well as a unit test.
2024-06-21 12:27:19 -07:00
Daniel Hiltgen
c7c2f3bc22 Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
2024-06-20 11:44:08 -07:00
Daniel Hiltgen
54a79d6a8a Merge pull request #5125 from dhiltgen/fedora39
Bump latest fedora cuda repo to 39
2024-06-20 11:27:24 -07:00
Daniel Hiltgen
5bf5aeec01 Refine mmap default logic on linux
If we try to use mmap when the model is larger than the system free space, loading is slower than the no-mmap approach.
2024-06-20 11:07:04 -07:00
Daniel Hiltgen
1a1c99e334 Bump latest fedora cuda repo to 39 2024-06-18 17:13:54 -07:00
8 changed files with 140 additions and 25 deletions

View File

@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
ollama show llama3
```
### List models on your computer
```

View File

@@ -608,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)

View File

@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
"fmt"
"math"
"testing"
"time"
@@ -141,3 +142,65 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
})
}
}
// TestUseMmapFormatParams checks how FormatParams handles the "use_mmap"
// parameter: boolean-like strings ("true", "false", "1", "0") must come back
// as the matching TriState value, and a string that cannot be parsed as a
// bool must produce an "invalid bool value" error.
func TestUseMmapFormatParams(t *testing.T) {
	tests := []struct {
		name string
		req  map[string][]string
		exp  TriState
		err  error
	}{
		{
			name: "True",
			req: map[string][]string{
				"use_mmap": {"true"},
			},
			exp: TriStateTrue,
			err: nil,
		},
		{
			name: "False",
			req: map[string][]string{
				"use_mmap": {"false"},
			},
			exp: TriStateFalse,
			err: nil,
		},
		{
			name: "Numeric True",
			req: map[string][]string{
				"use_mmap": {"1"},
			},
			exp: TriStateTrue,
			err: nil,
		},
		{
			name: "Numeric False",
			req: map[string][]string{
				"use_mmap": {"0"},
			},
			exp: TriStateFalse,
			err: nil,
		},
		{
			name: "invalid string",
			req: map[string][]string{
				"use_mmap": {"foo"},
			},
			// TriStateUndefined marks cases where the output value is not checked.
			exp: TriStateUndefined,
			err: fmt.Errorf("invalid bool value [foo]"),
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			resp, err := FormatParams(test.req)
			// testify takes (expected, actual); keeping that order makes a
			// failure report label the two values correctly.
			require.Equal(t, test.err, err)

			// Indexing is safe even when resp is nil (the error path).
			respVal, ok := resp["use_mmap"]
			if test.exp != TriStateUndefined {
				assert.True(t, ok, "resp: %v", resp)
				assert.Equal(t, test.exp, respVal)
			}
		})
	}
}

View File

@@ -321,7 +321,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
vocab := llm.KV()["tokenizer.ggml.tokens"].(*array).size
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()

View File

@@ -316,7 +316,7 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -327,6 +327,8 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a := &array{size: uint64(n)}
for i := 0; uint32(i) < n; i++ {
var e any
switch t {
@@ -361,13 +363,27 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if len(a.values) < arrayMaxSize {
a.values = append(a.values, e)
}
}
return
return a, nil
}
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
// arrayMaxSize is the maximum number of elements the GGUF array readers
// append to array.values; once values reaches this length, further elements
// are not appended.
const arrayMaxSize = 1000
// array holds the contents of a GGUF array together with its total element
// count, which is tracked separately because values may be truncated.
type array struct {
// size is the total number of elements in the array, set independently of
// how many elements are actually kept in values.
size uint64
// values is the slice of values in the array.
//
// Its length may be less than size if the array is too big to reasonably
// fit in memory. The current limit is arrayMaxSize.
values []any
}
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -382,6 +398,8 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a := &array{size: n}
for i := 0; uint64(i) < n; i++ {
var e any
switch t {
@@ -416,10 +434,16 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
// TODO(bmizerany): We may want to only enforce this limit
// on certain fields, however, as of now, I (bmizerany) do
// not know of any array fields that are needed by Ollama that
// exceed this limit.
if len(a.values) < arrayMaxSize {
a.values = append(a.values, e)
}
}
return
return a, nil
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {

View File

@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var err error
var cpuRunner string
var estimate MemoryEstimate
var systemMemory uint64
var systemTotalMemory uint64
var systemFreeMemory uint64
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
}
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
// Windows CUDA should not use mmap for best performance
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
// Linux with a model larger than free space, mmap leads to thrashing
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}

View File

@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;

View File

@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
layers, err := parseFromFile(ctx, temp, "", fn)
layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil {
return err
}
if len(layers) != 1 {
return errors.New("quantization failed")
if _, err := temp.Seek(0, io.SeekStart); err != nil {
return err
}
baseLayer.Layer = layers[0].Layer
baseLayer.GGML = layers[0].GGML
ggml, _, err := llm.DecodeGGML(temp)
if err != nil {
return err
}
baseLayer.Layer = layer
baseLayer.GGML = ggml
}
}