Compare commits

..

21 Commits

Author SHA1 Message Date
Roy Han
d77a174eb4 defaut timeout 2024-06-27 14:58:31 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
Michael Yang
123a722a6f zip: prevent extracting files into parent dirs (#5314) 2024-06-26 21:38:21 -07:00
Jeffrey Morgan
4d311eb731 llm: architecture patch (#5316) 2024-06-26 21:38:12 -07:00
Blake Mizerany
cb42e607c5 llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files
and their metadata and tensor information to be VERY slow:

  * Too many allocations when decoding strings
  * Hitting disk for each read of each key and value, resulting in a
    not-okay amount of syscalls/disk I/O.

The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro
m3.

This commit also prevents collecting large arrays of values when
decoding GGUFs (if desired). When such keys are encountered, their
values are null, and are encoded as such in JSON.

Also, this fixes a broken test that was not encoding valid GGUF.
2024-06-24 21:47:52 -07:00
Blake Mizerany
2aa91a937b cmd: defer stating model info until necessary (#5248)
This commit changes the 'ollama run' command to defer fetching model
information until it really needs it. That is, when in interactive mode.

It also removes one such case where the model information is fetch in
duplicate, just before calling generateInteractive and then again, first
thing, in generateInteractive.

This positively impacts the performance of the command:

    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.168 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.220 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.217 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 4% cpu 0.652 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 5% cpu 0.498 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 3% cpu 0.479 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
2024-06-24 20:14:03 -07:00
Daniel Hiltgen
ccef9431c8 Merge pull request #5205 from dhiltgen/modelfile_use_mmap
Fix use_mmap parsing for modelfiles
2024-06-21 16:30:36 -07:00
royjhan
9a9e7d83c4 Docs (#5149) 2024-06-21 15:52:09 -07:00
Michael Yang
189a43caa2 Merge pull request #5206 from ollama/mxyng/quantize
fix: quantization with template
2024-06-21 13:44:34 -07:00
Michael Yang
e835ef1836 fix: quantization with template 2024-06-21 13:39:25 -07:00
Daniel Hiltgen
7e7749224c Fix use_mmap parsing for modelfiles
Add the new tristate parsing logic for the code path for modelfiles,
as well as a unit test.
2024-06-21 12:27:19 -07:00
Daniel Hiltgen
c7c2f3bc22 Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
2024-06-20 11:44:08 -07:00
Daniel Hiltgen
54a79d6a8a Merge pull request #5125 from dhiltgen/fedora39
Bump latest fedora cuda repo to 39
2024-06-20 11:27:24 -07:00
Daniel Hiltgen
5bf5aeec01 Refine mmap default logic on linux
If we try to use mmap when the model is larger than the system free space, loading is slower than the no-mmap approach.
2024-06-20 11:07:04 -07:00
Michael Yang
e01e535cbb Merge pull request #5192 from ollama/mxyng/kv
handle asymmetric embedding KVs
2024-06-20 10:46:24 -07:00
Josh
0195d6a2f8 Merge pull request #5188 from ollama/jyan/tmpdir2
fix: skip os.removeAll() if PID does not exist
2024-06-20 10:40:59 -07:00
Michael Yang
8e0641a9bf handle asymmetric embedding KVs 2024-06-20 09:57:27 -07:00
Josh Yan
662568d453 err!=nil check 2024-06-20 09:30:59 -07:00
Josh Yan
4ebb66c662 reformat error check 2024-06-20 09:23:43 -07:00
Josh Yan
23e899f32d skip os.removeAll() if PID does not exist 2024-06-20 08:51:35 -07:00
Daniel Hiltgen
1a1c99e334 Bump latest fedora cuda repo to 39 2024-06-18 17:13:54 -07:00
23 changed files with 914 additions and 202 deletions

View File

@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` | | Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` | | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` | | Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` | | Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` | | Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` | | Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
``` ```
### Show model information
```
ollama show llama3
```
### List models on your computer ### List models on your computer
``` ```

View File

@@ -608,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else { } else {
field := valueOpts.FieldByName(opt.Name) field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() { if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() { switch field.Kind() {
case reflect.Float32: case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32) floatVal, err := strconv.ParseFloat(vals[0], 32)

View File

@@ -2,6 +2,7 @@ package api
import ( import (
"encoding/json" "encoding/json"
"fmt"
"math" "math"
"testing" "testing"
"time" "time"
@@ -141,3 +142,65 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
}) })
} }
} }
func TestUseMmapFormatParams(t *testing.T) {
tests := []struct {
name string
req map[string][]string
exp TriState
err error
}{
{
name: "True",
req: map[string][]string{
"use_mmap": []string{"true"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "False",
req: map[string][]string{
"use_mmap": []string{"false"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "Numeric True",
req: map[string][]string{
"use_mmap": []string{"1"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "Numeric False",
req: map[string][]string{
"use_mmap": []string{"0"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "invalid string",
req: map[string][]string{
"use_mmap": []string{"foo"},
},
exp: TriStateUndefined,
err: fmt.Errorf("invalid bool value [foo]"),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
resp, err := FormatParams(test.req)
require.Equal(t, err, test.err)
respVal, ok := resp["use_mmap"]
if test.exp != TriStateUndefined {
assert.True(t, ok, "resp: %v", resp)
assert.Equal(t, test.exp, respVal)
}
})
}
}

View File

@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
} }
defer tempfile.Close() defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) { detectContentType := func(path string) (string, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...) files = append(files, tks...)
} }
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files { for _, file := range files {
f, err := os.Open(file) f, err := os.Open(file)
if err != nil { if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
} }
func RunHandler(cmd *cobra.Command, args []string) error { func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true interactive := true
opts := runOptions{ opts := runOptions{
Model: args[0], Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color", WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{}, Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
} }
format, err := cmd.Flags().GetString("format") format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
} }
opts.WordWrap = !nowrap opts.WordWrap = !nowrap
if !interactive { // Fill out the rest of the options based on information about the
return generate(cmd, opts) // model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
} }
return generateInteractive(cmd, opts) name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
} }
func errFromUnknownKey(unknownKeyErr error) error { func errFromUnknownKey(unknownKeyErr error) error {

View File

@@ -31,65 +31,40 @@ const (
) )
func loadModel(cmd *cobra.Command, opts *runOptions) error { func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr) p := progress.NewProgress(os.Stderr)
defer p.StopAndClear() defer p.StopAndClear()
spinner := progress.NewSpinner("") spinner := progress.NewSpinner("")
p.Add("", spinner) p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model} client, err := api.ClientFromEnvironment()
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil { if err != nil {
return err return err
} }
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{ chatReq := &api.ChatRequest{
Model: opts.Model, Model: opts.Model,
Messages: []api.Message{}, KeepAlive: opts.KeepAlive,
} }
if opts.KeepAlive != nil { return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear() p.StopAndClear()
if len(opts.Messages) > 0 { for _, msg := range opts.Messages {
for _, msg := range opts.Messages { switch msg.Role {
switch msg.Role { case "user":
case "user": fmt.Printf(">>> %s\n", msg.Content)
fmt.Printf(">>> %s\n", msg.Content) case "assistant":
case "assistant": state := &displayResponseState{}
state := &displayResponseState{} displayResponse(msg.Content, opts.WordWrap, state)
displayResponse(msg.Content, opts.WordWrap, state) fmt.Println()
fmt.Println() fmt.Println()
fmt.Println()
}
} }
} }
return nil return nil
}) })
if err != nil {
return err
}
return nil
} }
func generateInteractive(cmd *cobra.Command, opts runOptions) error { func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts) err := loadModel(cmd, &opts)
if err != nil { if err != nil {
return err return err

View File

@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue continue
} }
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil { if err != nil {
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err) slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
} }
} }
} }

View File

@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors return llm.tensors
} }
func (llm *ggla) decode(rs io.ReadSeeker) error { func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32 var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for { for {
var dims uint32 var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err return err
} }
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32 var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil { if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err return err
} }
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil { if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err return err
} }

View File

@@ -6,6 +6,8 @@ import (
"fmt" "fmt"
"io" "io"
"strings" "strings"
"github.com/ollama/ollama/util/bufioutil"
) )
type GGML struct { type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1 return 1
} }
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / kv.HeadCount()
}
return 0
}
func (kv KV) EmbeddingHeadCountK() uint64 {
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
return k
}
return kv.EmbeddingHeadCount()
}
func (kv KV) EmbeddingHeadCountV() uint64 {
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
return v
}
return kv.EmbeddingHeadCount()
}
func (kv KV) GQA() uint64 { func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV() return kv.HeadCount() / kv.HeadCountKV()
} }
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
} }
} }
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { // DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32 var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err return nil, 0, err
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA: case FILE_MAGIC_GGLA:
c = &containerGGLA{} c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE: case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian} c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE: case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian} c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default: default:
return nil, 0, errors.New("invalid file magic") return nil, 0, errors.New("invalid file magic")
} }
model, err := c.Decode(rs) model, err := c.Decode(rs)
if errors.Is(err, io.EOF) { if err != nil {
// noop
} else if err != nil {
return nil, 0, err return nil, 0, err
} }
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength() embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount() heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV() headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
layers := llm.Tensors().Layers() layers := llm.Tensors().Layers()
@@ -308,7 +346,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding partialOffload = 4 * batch * embedding
partialOffload += max( partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()), // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128, 4*batch*(embedding+vocab)+embedding*vocab*105/128,
) )
@@ -316,15 +354,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b // mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max( partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV), 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch), 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
) )
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b // mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1] ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max( partialOffload = max(
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
) )
} }
@@ -368,15 +406,14 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
fullOffload, fullOffload,
) )
case "deepseek2": case "deepseek2":
keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32))
fullOffload = max( fullOffload = max(
4*batch*(3*embedding+vocab), 4*batch*(3*embedding+vocab),
4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV), 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
) )
partialOffload = max( partialOffload = max(
4*batch*(3*embedding+vocab)+embedding*vocab*105/128, 4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16, 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
) )
} }

1
llm/ggml_test.go Normal file
View File

@@ -0,0 +1 @@
package llm

View File

@@ -3,11 +3,10 @@ package llm
import ( import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"encoding/json"
"fmt" "fmt"
"io" "io"
"strings" "strings"
"log/slog"
) )
type containerGGUF struct { type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64 NumTensor uint64
NumKV uint64 NumKV uint64
} }
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
} }
func (c *containerGGUF) Name() string { func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
} }
model := newGGUF(c) model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil { if err := model.Decode(rs); err != nil {
return nil, err return nil, err
} }
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor tensors []*Tensor
parameters uint64 parameters uint64
scratch [16 << 10]byte
} }
func newGGUF(container *containerGGUF) *gguf { func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
} }
// decode tensors // decode tensors
for i := 0; uint64(i) < llm.numTensor(); i++ { for range llm.numTensor() {
name, err := readGGUFString(llm, rs) name, err := readGGUFString(llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor name: %w", err)
} }
// dims is the number of dimensions in the tensor // dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs) dims, err := readGGUF[uint32](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor dimensions: %w", err)
} }
shape := [4]uint64{1, 1, 1, 1} shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ { for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs) shape[i], err = readGGUF[uint64](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor shape: %w", err)
} }
} }
kind, err := readGGUF[uint32](llm, rs) kind, err := readGGUF[uint32](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor kind: %w", err)
} }
offset, err := readGGUF[uint64](llm, rs) offset, err := readGGUF[uint64](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor offset: %w", err)
} }
tensor := Tensor{ tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32 alignment = 32
} }
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors { for _, tensor := range llm.tensors {
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { offset, err := rs.Seek(0, io.SeekCurrent)
return err if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
} }
padding := llm.padding(int64(tensor.Size()), int64(alignment)) padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
} }
} }
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil return b.String(), nil
} }
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) { func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 { if llm.Version == 1 {
return readGGUFV1String(llm, r) return readGGUFV1String(llm, r)
} }
var length uint64 buf := llm.scratch[:8]
if err := binary.Read(r, llm.ByteOrder, &length); err != nil { _, err := io.ReadFull(r, buf)
if err != nil {
return "", err return "", err
} }
var b bytes.Buffer length := int(llm.ByteOrder.Uint64(buf))
if _, err := io.CopyN(&b, r, int64(length)); err != nil { if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err return "", err
} }
return string(buf), nil
return b.String(), nil
} }
func writeGGUFString(llm *gguf, w io.Writer, s string) error { func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err return err
} }
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r) t, err := readGGUF[uint32](llm, r)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
for i := 0; uint32(i) < n; i++ { a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any var e any
switch t { switch t {
case ggufTypeUint8: case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
a = append(a, e) if a.values != nil {
a.values[i] = e
}
} }
return return a, nil
} }
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 { if llm.Version == 1 {
return readGGUFV1Array(llm, r) return readGGUFV1Array(llm, r)
} }
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
for i := 0; uint64(i) < n; i++ { a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any var e any
switch t { switch t {
case ggufTypeUint8: case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool: case ggufTypeBool:
e, err = readGGUF[bool](llm, r) e, err = readGGUF[bool](llm, r)
case ggufTypeString: case ggufTypeString:
e, err = readGGUFString(llm, r) if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default: default:
return nil, fmt.Errorf("invalid array type: %d", t) return nil, fmt.Errorf("invalid array type: %d", t)
} }
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
a = append(a, e) if a.values != nil {
a.values[i] = e
}
} }
return return a, nil
} }
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error { func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {

View File

@@ -115,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
slog.Warn("model missing blk.0 layer size") slog.Warn("model missing blk.0 layer size")
} }
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
// KV is proportional to the number of layers // KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount() layerSize += kv / ggml.KV().BlockCount()

View File

@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close() defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian) gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5 inputLayerCount := 5
tensors := []Tensor{ tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
} }
assert.Len(t, tensors, inputLayerCount+1) assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{ err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
}, tensors) }, tensors)
require.NoError(t, err) require.NoError(t, err)
ggml, err := LoadModel(f.Name()) ggml, err := LoadModel(f.Name(), 0)
require.NoError(t, err) if err != nil {
t.Fatal(err)
}
// Simple CPU scenario // Simple CPU scenario
gpus := []gpu.GpuInfo{ gpus := []gpu.GpuInfo{

305
llm/patches/07-gemma.diff Normal file
View File

@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2

View File

@@ -60,7 +60,12 @@ type llmServer struct {
sem *semaphore.Weighted sem *semaphore.Weighted
} }
func LoadModel(model string) (*GGML, error) { // LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
} }
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
} }
defer f.Close() defer f.Close()
ggml, _, err := DecodeGGML(f) ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err return ggml, err
} }
@@ -81,7 +86,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var err error var err error
var cpuRunner string var cpuRunner string
var estimate MemoryEstimate var estimate MemoryEstimate
var systemMemory uint64 var systemTotalMemory uint64
var systemFreeMemory uint64
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
}
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 { if opts.NumGPU == 0 {
@@ -91,19 +106,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch { switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this // disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system // can lead to locking up the system
opts.NumGPU = 0 opts.NumGPU = 0
@@ -160,6 +166,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--log-disable") params = append(params, "--log-disable")
params = append(params, "--timeout", fmt.Sprintf("%d", 600))
if opts.NumGPU >= 0 { if opts.NumGPU >= 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
} }
@@ -211,7 +219,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} }
// Windows CUDA should not use mmap for best performance // Windows CUDA should not use mmap for best performance
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse { // Linux with a model larger than free space, mmap leads to thrashing
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap") params = append(params, "--no-mmap")
} }
@@ -408,7 +419,7 @@ func projectorMemoryRequirements(filename string) uint64 {
} }
defer file.Close() defer file.Close()
ggml, _, err := DecodeGGML(file) ggml, _, err := DecodeGGML(file, 0)
if err != nil { if err != nil {
return 0 return 0
} }

View File

@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;; centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;; rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';; fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;; amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;; debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;; ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;

View File

@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err return err
} }
layers, err := parseFromFile(ctx, temp, "", fn) layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil { if err != nil {
return err return err
} }
if len(layers) != 1 { if _, err := temp.Seek(0, io.SeekStart); err != nil {
return errors.New("quantization failed") return err
} }
baseLayer.Layer = layers[0].Layer ggml, _, err := llm.DecodeGGML(temp, 0)
baseLayer.GGML = layers[0].GGML if err != nil {
return err
}
baseLayer.Layer = layer
baseLayer.GGML = ggml
} }
} }

View File

@@ -11,6 +11,7 @@ import (
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert" "github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
} }
defer blob.Close() defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob) ggml, _, err := llm.DecodeGGML(blob, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil return layers, nil
} }
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
stat, err := file.Stat() stat, err := file.Stat()
if err != nil { if err != nil {
return nil, err return err
} }
r, err := zip.NewReader(file, stat.Size()) r, err := zip.NewReader(file, stat.Size())
if err != nil { if err != nil {
return nil, err return err
} }
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempdir)
fn(api.ProgressResponse{Status: "unpacking model metadata"}) fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File { for _, f := range r.File {
n := filepath.Join(p, f.Name)
if !strings.HasPrefix(n, p) {
slog.Warn("skipped extracting file outside of context", "name", f.Name)
continue
}
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
return err
}
// TODO(mxyng): this should not write out all files to disk // TODO(mxyng): this should not write out all files to disk
outfile, err := os.Create(filepath.Join(tempdir, f.Name)) outfile, err := os.Create(n)
if err != nil { if err != nil {
return nil, err return err
} }
defer outfile.Close() defer outfile.Close()
infile, err := f.Open() infile, err := f.Open()
if err != nil { if err != nil {
return nil, err return err
} }
defer infile.Close() defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil { if _, err = io.Copy(outfile, infile); err != nil {
return nil, err return err
} }
if err := outfile.Close(); err != nil { if err := outfile.Close(); err != nil {
return nil, err return err
} }
if err := infile.Close(); err != nil { if err := infile.Close(); err != nil {
return nil, err return err
} }
} }
mf, err := convert.GetModelFormat(tempdir) return nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempDir)
if err := extractFromZipFile(tempDir, file, fn); err != nil {
return nil, err
}
mf, err := convert.GetModelFormat(tempDir)
if err != nil { if err != nil {
return nil, err return nil, err
} }
params, err := mf.GetParams(tempdir) params, err := mf.GetParams(tempDir)
if err != nil { if err != nil {
return nil, err return nil, err
} }
mArch, err := mf.GetModelArch("", tempdir, params) mArch, err := mf.GetModelArch("", tempDir, params)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer // TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model") // e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
temp, err := os.CreateTemp(tempdir, "fp16") temp, err := os.CreateTemp(tempDir, "fp16")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
} }
defer bin.Close() defer bin.Close()
ggml, _, err := llm.DecodeGGML(bin) ggml, _, err := llm.DecodeGGML(bin, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64 var offset int64
for offset < stat.Size() { for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(file) ggml, n, err := llm.DecodeGGML(file, 0)
if errors.Is(err, io.EOF) { if errors.Is(err, io.EOF) {
break break
} else if err != nil { } else if err != nil {

92
server/model_test.go Normal file
View File

@@ -0,0 +1,92 @@
package server
import (
"archive/zip"
"bytes"
"io"
"os"
"path/filepath"
"slices"
"testing"
"github.com/ollama/ollama/api"
)
func createZipFile(t *testing.T, name string) *os.File {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "")
if err != nil {
t.Fatal(err)
}
zf := zip.NewWriter(f)
defer zf.Close()
zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
if err != nil {
t.Fatal(err)
}
if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
t.Fatal(err)
}
return f
}
func TestExtractFromZipFile(t *testing.T) {
cases := []struct {
name string
expect []string
}{
{
name: "good",
expect: []string{"good"},
},
{
name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
f := createZipFile(t, tt.name)
defer f.Close()
tempDir := t.TempDir()
if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
t.Fatal(err)
}
var matches []string
if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
if err != nil {
return err
}
if !fi.IsDir() {
matches = append(matches, p)
}
return nil
}); err != nil {
t.Fatal(err)
}
var actual []string
for _, match := range matches {
rel, err := filepath.Rel(tempDir, match)
if err != nil {
t.Error(err)
}
actual = append(actual, rel)
}
if !slices.Equal(actual, tt.expect) {
t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
}
})
}
}

View File

@@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
} }
func getKVData(digest string, verbose bool) (llm.KV, error) { func getKVData(digest string, verbose bool) (llm.KV, error) {
kvData, err := llm.LoadModel(digest) maxArraySize := 0
if verbose {
maxArraySize = -1
}
kvData, err := llm.LoadModel(digest, maxArraySize)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
schedCtx, schedDone := context.WithCancel(ctx) schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx) sched := InitScheduler(schedCtx)
s := &Server{addr: ln.Addr(), sched: sched} s := &Server{addr: ln.Addr(), sched: sched}
r := s.GenerateRoutes()
http.Handle("/", s.GenerateRoutes())
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version)) slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{ srvr := &http.Server{
Handler: r, // Use http.DefaultServeMux so we get net/http/pprof for
// free.
//
// TODO(bmizerany): Decide if we want to make this
// configurable so it is not exposed by default, or allow
// users to bind it to a different port. This was a quick
// and easy way to get pprof, but it may not be the best
// way.
Handler: nil,
} }
// listen for a ctrl+c and stop any loaded llm // listen for a ctrl+c and stop any loaded llm

View File

@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
// Load model for fitting // Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath) ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
if err != nil { if err != nil {
pending.errCh <- err pending.errCh <- err
break break

View File

@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0}, "tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{ }, []llm.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}) })
require.NoError(t, err) require.NoError(t, err)
fname := f.Name() fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname} model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath) scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
require.NoError(t, err) require.NoError(t, err)
scenario.req = &LlmRequest{ scenario.req = &LlmRequest{

View File

@@ -0,0 +1,34 @@
package bufioutil
import (
"bufio"
"io"
)
type BufferedSeeker struct {
rs io.ReadSeeker
br *bufio.Reader
}
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
return &BufferedSeeker{
rs: rs,
br: bufio.NewReaderSize(rs, size),
}
}
func (b *BufferedSeeker) Read(p []byte) (int, error) {
return b.br.Read(p)
}
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
if whence == io.SeekCurrent {
offset -= int64(b.br.Buffered())
}
n, err := b.rs.Seek(offset, whence)
if err != nil {
return 0, err
}
b.br.Reset(b.rs)
return n, nil
}

View File

@@ -0,0 +1,64 @@
package bufioutil
import (
"bytes"
"io"
"strings"
"testing"
)
func TestBufferedSeeker(t *testing.T) {
const alphabet = "abcdefghijklmnopqrstuvwxyz"
bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
checkRead := func(buf []byte, expected string) {
t.Helper()
_, err := bs.Read(buf)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(buf, []byte(expected)) {
t.Fatalf("expected %s, got %s", expected, buf)
}
}
// Read the first 5 bytes
buf := make([]byte, 5)
checkRead(buf, "abcde")
// Seek back to the beginning
_, err := bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
// read 'a'
checkRead(buf[:1], "a")
if bs.br.Buffered() == 0 {
t.Fatalf("totally unexpected sanity check failed")
}
// Seek past 'b'
_, err = bs.Seek(1, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "cdefg")
// Seek back to the beginning
_, err = bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "abcde")
// Seek to the end
_, err = bs.Seek(-5, io.SeekEnd)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "vwxyz")
}