Compare commits


8 Commits

Author      SHA1         Message              Date
Josh Yan    4da5d5beaa   lint                 2024-08-28 10:23:41 -07:00
Josh Yan    cc17b02b23   update               2024-08-28 09:58:23 -07:00
Josh Yan    73d69bc90b   remove types         2024-08-27 16:45:07 -07:00
Josh Yan    9bc42f532b   rmv api type         2024-08-27 16:45:07 -07:00
Josh Yan    07c0f66f5e   rm print             2024-08-27 16:45:04 -07:00
Josh Yan    4a7bfca902   change progress msg  2024-08-27 16:44:38 -07:00
Josh Yan    04f2154505   fixed cgo            2024-08-27 16:44:38 -07:00
Josh Yan    de9b21b472   quantize progress    2024-08-27 16:44:32 -07:00
15 changed files with 1361 additions and 159 deletions

.gitattributes vendored

@@ -1,3 +1,4 @@
 llm/ext_server/* linguist-vendored
 llm/*.h linguist-vendored
 * text=auto
+*.go text eol=lf


@@ -124,7 +124,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	}

	bars := make(map[string]*progress.Bar)
-	var convertSpin *progress.Spinner
+	var quantizeSpin *progress.Spinner
	fn := func(resp api.ProgressResponse) error {
		if resp.Digest != "" {
			spinner.Stop()
@@ -137,15 +137,14 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
			}

			bar.Set(resp.Completed)
-		} else if strings.Contains(resp.Status, "converting") {
+		} else if strings.Contains(resp.Status, "quantizing") {
			spinner.Stop()

-			if convertSpin != nil {
-				convertSpin.SetMessage(resp.Status)
+			if quantizeSpin != nil {
+				quantizeSpin.SetMessage(resp.Status)
			} else {
-				status = resp.Status
-				convertSpin = progress.NewSpinner(resp.Status)
-				p.Add("convert", convertSpin)
+				quantizeSpin = progress.NewSpinner(resp.Status)
+				p.Add("quantize", quantizeSpin)
			}
		} else if status != resp.Status {
			spinner.Stop()
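
The dedicated quantizeSpin keeps the fast-changing "quantizing model N%" statuses on a single spinner line: the first such status creates the spinner, and every later one only calls SetMessage. A minimal sketch of that reuse pattern, assuming the progress package behaves the way the handler above uses it; the driving loop here is illustrative and not part of the diff:

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/ollama/ollama/progress"
)

func main() {
	p := progress.NewProgress(os.Stderr)
	defer p.Stop()

	var quantizeSpin *progress.Spinner
	// Simulated status stream, standing in for api.ProgressResponse updates.
	for pct := 0; pct <= 100; pct += 25 {
		status := fmt.Sprintf("quantizing model %d%%", pct)
		if quantizeSpin == nil {
			// First "quantizing" status: create the spinner once.
			quantizeSpin = progress.NewSpinner(status)
			p.Add("quantize", quantizeSpin)
		} else {
			// Later updates reuse the same line via SetMessage.
			quantizeSpin.SetMessage(status)
		}
		time.Sleep(100 * time.Millisecond)
	}
}
```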


@@ -9,7 +9,6 @@ import (
	"log/slog"
	"strings"

-	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/llm"
 )
@@ -80,12 +79,12 @@ func (ModelParameters) specialTokenTypes() []string {
	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor, fn func(api.ProgressResponse)) error {
-	return llm.WriteGGUF(ws, kv, ts, fn)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor, fn func(api.ProgressResponse)) error {
-	return llm.WriteGGUF(ws, kv, ts, fn)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
@@ -100,7 +99,7 @@ type ModelConverter interface {
	// specialTokenTypes returns any special token types the model uses
	specialTokenTypes() []string
	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor, func(api.ProgressResponse)) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

 type moreParser interface {
@@ -116,10 +115,10 @@ type AdapterConverter interface {
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string

-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor, func(api.ProgressResponse)) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV, fn func(api.ProgressResponse)) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
@@ -154,17 +153,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV, fn func(api.Pr
		return err
	}

-	fn(api.ProgressResponse{
-		Status: fmt.Sprintf("converting adapter 0%%"),
-	})
-
-	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts), fn)
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
 }

 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker, fn func(api.ProgressResponse)) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
	bts, err := fs.ReadFile(fsys, "config.json")
	if err != nil {
		return err
@@ -228,8 +224,5 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker, fn func(api.ProgressResponse))
		return err
	}

-	fn(api.ProgressResponse{
-		Status: fmt.Sprintf("converting model 0%%"),
-	})
-
-	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts), fn)
+	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }
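
After this change, ConvertModel and ConvertAdapter no longer accept a progress callback; progress reporting moves to the quantize path instead. A minimal caller under the new signature, assuming a safetensors model directory (both paths here are hypothetical, for illustration only):

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/convert"
)

func main() {
	// Destination GGUF file; path is hypothetical.
	f, err := os.Create("model-f16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Previously: convert.ConvertModel(fsys, f, func(api.ProgressResponse) {})
	// The callback parameter is gone; conversion runs without emitting progress.
	if err := convert.ConvertModel(os.DirFS("./safetensors-model"), f); err != nil {
		log.Fatal(err)
	}
}
```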


@@ -19,7 +19,6 @@ import (
	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/llm"
 )
@@ -32,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
	}
	defer f.Close()

-	if err := ConvertModel(fsys, f, func(api.ProgressResponse) {}); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
		t.Fatal(err)
	}
@@ -141,107 +140,6 @@ func TestConvertFull(t *testing.T) {
	}
 }

-func TestConvertInvalidDatatype(t *testing.T) {
-	f, err := os.CreateTemp(t.TempDir(), "testmodel")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	tempDir := t.TempDir()
-	generateSafetensorTestData(t, tempDir)
-
-	err = ConvertModel(os.DirFS(tempDir), f, func(api.ProgressResponse) {})
-	if err == nil || err.Error() != "unsupported safetensors model" {
-		t.Errorf("expected error but didn't get one")
-	}
-}
-
-func generateSafetensorTestData(t *testing.T, tempDir string) {
-	type tensorData struct {
-		Offsets []int  `json:"data_offsets"`
-		Type    string `json:"dtype"`
-		Shape   []int  `json:"shape"`
-	}
-	offset := 4096 * 14336
-
-	td := map[string]*tensorData{}
-	td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
-		Offsets: []int{0, offset},
-		Type:    "I8",
-		Shape:   []int{4096, 14336},
-	}
-	td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
-		Offsets: []int{offset, offset},
-		Type:    "U8",
-		Shape:   []int{},
-	}
-
-	data, err := json.Marshal(td)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var buf bytes.Buffer
-
-	l := int64(len(data))
-	err = binary.Write(&buf, binary.LittleEndian, l)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = buf.Write(data)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer fdata.Close()
-
-	_, err = fdata.Write(buf.Bytes())
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	configData := `
-{
-  "architectures": [
-    "LlamaForCausalLM"
-  ]
-}
-`
-
-	f, err := os.Create(filepath.Join(tempDir, "config.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(configData)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	tokenizerData := `
-{
-}
-`
-
-	f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(tokenizerData)
-	if err != nil {
-		t.Fatal(err)
-	}
-}
-
 func TestConvertAdapter(t *testing.T) {
	type AdapterCase struct {
		Name   string
@@ -288,7 +186,7 @@ func TestConvertAdapter(t *testing.T) {
			tempDir := t.TempDir()
			generateLoraTestData(t, tempDir)

-			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV, func(api.ProgressResponse) {}); err != nil {
+			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
				t.Fatal(err)
			}


@@ -4,7 +4,6 @@ import (
	"bytes"
	"encoding/binary"
	"encoding/json"
-	"errors"
	"fmt"
	"io"
	"io/fs"
@@ -51,10 +50,6 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
	for _, key := range keys {
		if value := headers[key]; value.Type != "" {
-			// bitsandbytes quantized models are unsupported
-			if len(value.Shape) == 0 {
-				return nil, errors.New("unsupported safetensors model")
-			}
			ts = append(ts, safetensor{
				fs:   fsys,
				path: p,


@@ -12,8 +12,6 @@ import (
	"strings"

	"golang.org/x/exp/maps"
-
-	"github.com/ollama/ollama/api"
 )

 type containerGGUF struct {
@@ -508,7 +506,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
	return binary.Write(w, binary.LittleEndian, s)
 }

-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor, fn func(api.ProgressResponse)) error {
+func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
		return err
	}
@@ -554,10 +552,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor, fn func(api.ProgressRespon
	}

	var alignment int64 = 32
-	for i, t := range ts {
-		fn(api.ProgressResponse{
-			Status: fmt.Sprintf("converting model %d%%", 100*(i+1)/len(ts)),
-		})
+	for _, t := range ts {
		if err := ggufWriteTensor(ws, t, alignment); err != nil {
			return err
		}
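
With the per-tensor fn(...) calls removed from the write loop, WriteGGUF is a pure serializer again. A minimal call under the new three-argument signature, in the same spirit as the updated tests later in this diff (the output path is illustrative):

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	f, err := os.Create("minimal.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Write a GGUF with a single metadata key and no tensors; the old
	// fourth argument, func(api.ProgressResponse) {}, is simply dropped.
	if err := llm.WriteGGUF(f, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
		log.Fatal(err)
	}
}
```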

llm/llama.h vendored (new file, 1227 additions; file diff suppressed because it is too large)


@@ -1,6 +1,6 @@
 package llm

-// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
+// #cgo CPPFLAGS: -Illama.cpp/ggml/include
 // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
@@ -9,12 +9,24 @@ package llm
 // #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
 // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
 // #include <stdlib.h>
+// #include <stdatomic.h>
 // #include "llama.h"
+// bool update_quantize_progress(float progress, void* data) {
+//     atomic_int* atomicData = (atomic_int*)data;
+//     int intProgress = *((int*)&progress);
+//     atomic_store(atomicData, intProgress);
+//     return true;
+// }
 import "C"

 import (
	"errors"
	"fmt"
+	"sync/atomic"
+	"time"
	"unsafe"

+	"github.com/ollama/ollama/api"
 )

 // SystemInfo is an unused example of calling llama.cpp functions using CGo
@@ -22,17 +34,49 @@ func SystemInfo() string {
	return C.GoString(C.llama_print_system_info())
 }

-func Quantize(infile, outfile string, ftype fileType) error {
+func Quantize(infile, outfile string, ftype fileType, fn func(resp api.ProgressResponse), tensorCount int) error {
	cinfile := C.CString(infile)
	defer C.free(unsafe.Pointer(cinfile))
	coutfile := C.CString(outfile)
	defer C.free(unsafe.Pointer(coutfile))

	params := C.llama_model_quantize_default_params()
	params.nthread = -1
	params.ftype = ftype.Value()

+	// Initialize "global" to store progress
+	store := (*int32)(C.malloc(C.sizeof_int))
+	defer C.free(unsafe.Pointer(store))
+
+	// Initialize store value, e.g., setting initial progress to 0
+	atomic.StoreInt32(store, 0)
+	params.quantize_callback_data = unsafe.Pointer(store)
+	params.quantize_callback = (C.llama_progress_callback)(C.update_quantize_progress)
+
+	ticker := time.NewTicker(30 * time.Millisecond)
+	done := make(chan struct{})
+	defer close(done)
+
+	go func() {
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				progressInt := atomic.LoadInt32(store)
+				progress := *(*float32)(unsafe.Pointer(&progressInt))
+				fn(api.ProgressResponse{
+					Status: fmt.Sprintf("quantizing model %d%%", 100*int(progress)/tensorCount),
+				})
+			case <-done:
+				fn(api.ProgressResponse{
+					Status: fmt.Sprintf("quantizing model 100%%"),
+				})
+				return
+			}
+		}
+	}()
+
	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
		return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
	}
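
The callback hands Go a float (the index of the tensor currently being quantized) by storing its raw bit pattern in a shared atomic int; the ticker goroutine loads the same 32 bits and reinterprets them as a float32 before computing a percentage. A self-contained sketch of that round trip, using math.Float32bits/Float32frombits as a stand-in for the unsafe pointer casts above (all values here are hypothetical):

```go
package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

func main() {
	var store atomic.Int32

	// Producer (the C callback in the diff): store the float's bit
	// pattern, not its numeric value, into the shared 32-bit slot.
	progress := float32(42) // e.g. 42 tensors processed so far
	store.Store(int32(math.Float32bits(progress)))

	// Consumer (the ticker goroutine): load the bits and reinterpret
	// them as a float32 again.
	loaded := math.Float32frombits(uint32(store.Load()))

	tensorCount := 291 // hypothetical total from the model's metadata
	fmt.Printf("quantizing model %d%%\n", 100*int(loaded)/tensorCount)
}
```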


@@ -41,7 +41,7 @@ func TestEstimateGPULayers(t *testing.T) {
		"tokenizer.ggml.tokens":     []string{" "},
		"tokenizer.ggml.scores":     []float32{0},
		"tokenizer.ggml.token_type": []int32{0},
-	}, tensors, func(api.ProgressResponse) {})
+	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)


@@ -0,0 +1,52 @@
From ed941590d59fc07b1ad21d6aa458588e47d1e446 Mon Sep 17 00:00:00 2001
From: Josh Yan <jyan00017@gmail.com>
Date: Wed, 10 Jul 2024 13:39:39 -0700
Subject: [PATCH] quantize progress
---
 include/llama.h | 3 +++
 src/llama.cpp   | 8 ++++++++
 2 files changed, 11 insertions(+)
diff --git a/include/llama.h b/include/llama.h
index bb4b05ba..613db68e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -349,6 +349,9 @@ extern "C" {
         bool keep_split;                     // quantize to the same number of shards
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
+
+        llama_progress_callback quantize_callback;      // callback to report quantization progress
+        void * quantize_callback_data;                  // user data for the callback
     } llama_model_quantize_params;

     // grammar types
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..ac640c02 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18252,6 +18252,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
+        if (params->quantize_callback){
+            if (!params->quantize_callback(i, params->quantize_callback_data)) {
+                return;
+            }
+        }
+
         auto weight = ml.get_weight(i);
         struct ggml_tensor * tensor = weight->tensor;
         if (weight->idx != cur_split && params->keep_split) {
@@ -18789,6 +18795,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split            =*/ false,
         /*.imatrix               =*/ nullptr,
         /*.kv_overrides          =*/ nullptr,
+        /*.quantize_callback     =*/ nullptr,
+        /*.quantize_callback_data =*/ nullptr,
     };

     return result;
--
2.39.3 (Apple Git-146)


@@ -435,11 +435,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
				return err
			}

+			tensorCount := len(baseLayer.GGML.Tensors().Items)
			ft := baseLayer.GGML.KV().FileType()
			if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
				return errors.New("quantization is only supported for F16 and F32 models")
			} else if want != ft {
-				fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantization)})
+				fn(api.ProgressResponse{
+					Status: "quantizing model tensors",
+				})

				blob, err := GetBlobsPath(baseLayer.Digest)
				if err != nil {
@@ -453,7 +456,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
				defer temp.Close()
				defer os.Remove(temp.Name())

-				if err := llm.Quantize(blob, temp.Name(), want); err != nil {
+				if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
					return err
				}
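
Because the C callback reports a tensor index rather than a percentage, CreateModel has to pass the total tensor count alongside the client-facing callback. A condensed sketch of the resulting call path (not independently compilable: fileType is unexported in llm, so want only exists inside the server package):

```go
// Inside server.CreateModel, after the F16/F32 check succeeds:
tensorCount := len(baseLayer.GGML.Tensors().Items)

fn(api.ProgressResponse{Status: "quantizing model tensors"})

// fn and tensorCount let llm.Quantize turn the raw tensor index from
// the C callback into "quantizing model N%" status updates.
if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
	return err
}
```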


@@ -98,6 +98,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
	}
	defer os.RemoveAll(p)

+	fn(api.ProgressResponse{Status: "converting model"})
	// TODO(mxyng): this should write directly into a layer
	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
	t, err := os.CreateTemp(p, "fp16")
@@ -122,18 +123,13 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
		if baseModel == nil {
			return nil, fmt.Errorf("no base model specified for the adapter")
		}

-		fn(api.ProgressResponse{
-			Status: "converting adapter",
-		})
-		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV(), fn); err != nil {
+		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
			return nil, err
		}
		layerType = "application/vnd.ollama.image.adapter"
	case "model":
-		fn(api.ProgressResponse{
-			Status: "converting model",
-		})
-		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t, fn); err != nil {
+		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
			return nil, err
		}
		layerType = "application/vnd.ollama.image.model"


@@ -145,7 +145,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
		t.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

-	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}, func(api.ProgressResponse) {}); err != nil {
+	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
		t.Fatalf("failed to write gguf: %v", err)
	}
@@ -197,7 +197,7 @@ func TestParseLayerFromCopy(t *testing.T) {
	defer file2.Close()

	for range 5 {
-		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}, func(api.ProgressResponse) {}); err != nil {
+		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
			t.Fatalf("failed to write gguf: %v", err)
		}
	}


@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
	}
	defer f.Close()

-	if err := llm.WriteGGUF(f, kv, ti, func(api.ProgressResponse) {}); err != nil {
+	if err := llm.WriteGGUF(f, kv, ti); err != nil {
		t.Fatal(err)
	}


@@ -128,8 +128,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
-	},
-		func(api.ProgressResponse) {}))
+	}))
	require.NoError(t, err)

	fname := f.Name()