Compare commits


8 Commits

Author      SHA1         Message              Date
Josh Yan    4da5d5beaa   lint                 2024-08-28 10:23:41 -07:00
Josh Yan    cc17b02b23   update               2024-08-28 09:58:23 -07:00
Josh Yan    73d69bc90b   remove types         2024-08-27 16:45:07 -07:00
Josh Yan    9bc42f532b   rmv api type         2024-08-27 16:45:07 -07:00
Josh Yan    07c0f66f5e   rm print             2024-08-27 16:45:04 -07:00
Josh Yan    4a7bfca902   change progress msg  2024-08-27 16:44:38 -07:00
Josh Yan    04f2154505   fixed cgo            2024-08-27 16:44:38 -07:00
Josh Yan    de9b21b472   quantize progress    2024-08-27 16:44:32 -07:00
15 changed files with 1361 additions and 159 deletions

.gitattributes vendored

@@ -1,3 +1,4 @@
 llm/ext_server/* linguist-vendored
 llm/*.h linguist-vendored
 * text=auto
+*.go text eol=lf


@@ -124,7 +124,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	}

	bars := make(map[string]*progress.Bar)
-	var convertSpin *progress.Spinner
+	var quantizeSpin *progress.Spinner
	fn := func(resp api.ProgressResponse) error {
		if resp.Digest != "" {
			spinner.Stop()
@@ -137,15 +137,14 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
			}

			bar.Set(resp.Completed)
-		} else if strings.Contains(resp.Status, "converting") {
+		} else if strings.Contains(resp.Status, "quantizing") {
			spinner.Stop()

-			if convertSpin != nil {
-				convertSpin.SetMessage(resp.Status)
+			if quantizeSpin != nil {
+				quantizeSpin.SetMessage(resp.Status)
			} else {
-				status = resp.Status
-				convertSpin = progress.NewSpinner(resp.Status)
-				p.Add("convert", convertSpin)
+				quantizeSpin = progress.NewSpinner(resp.Status)
+				p.Add("quantize", quantizeSpin)
			}
		} else if status != resp.Status {
			spinner.Stop()
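
The dedicated quantizeSpin keeps the fast-changing "quantizing model N%" statuses on a single spinner line: the first such status creates the spinner, and every later one only calls SetMessage. A minimal sketch of that reuse pattern, assuming the progress package behaves the way the handler above uses it; the driving loop here is illustrative and not part of the diff:

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/ollama/ollama/progress"
)

func main() {
	p := progress.NewProgress(os.Stderr)
	defer p.Stop()

	var quantizeSpin *progress.Spinner
	// Simulated status stream, standing in for api.ProgressResponse updates.
	for pct := 0; pct <= 100; pct += 25 {
		status := fmt.Sprintf("quantizing model %d%%", pct)
		if quantizeSpin == nil {
			// First "quantizing" status: create the spinner once.
			quantizeSpin = progress.NewSpinner(status)
			p.Add("quantize", quantizeSpin)
		} else {
			// Later updates reuse the same line via SetMessage.
			quantizeSpin.SetMessage(status)
		}
		time.Sleep(100 * time.Millisecond)
	}
}
```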


@@ -9,7 +9,6 @@ import (
	"log/slog"
	"strings"

-	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/llm"
 )
@@ -80,12 +79,12 @@ func (ModelParameters) specialTokenTypes() []string {
	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor, fn func(api.ProgressResponse)) error {
-	return llm.WriteGGUF(ws, kv, ts, fn)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor, fn func(api.ProgressResponse)) error {
-	return llm.WriteGGUF(ws, kv, ts, fn)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
@@ -100,7 +99,7 @@ type ModelConverter interface {
	// specialTokenTypes returns any special token types the model uses
	specialTokenTypes() []string
	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor, func(api.ProgressResponse)) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

 type moreParser interface {
@@ -116,10 +115,10 @@ type AdapterConverter interface {
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string

-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor, func(api.ProgressResponse)) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV, fn func(api.ProgressResponse)) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
@@ -154,17 +153,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV, fn func(api.Pr
		return err
	}

-	fn(api.ProgressResponse{
-		Status: fmt.Sprintf("converting adapter 0%%"),
-	})
-
-	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts), fn)
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
 }

 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker, fn func(api.ProgressResponse)) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
	bts, err := fs.ReadFile(fsys, "config.json")
	if err != nil {
		return err
@@ -228,8 +224,5 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker, fn func(api.ProgressResponse))
		return err
	}

-	fn(api.ProgressResponse{
-		Status: fmt.Sprintf("converting model 0%%"),
-	})
-
-	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts), fn)
+	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }
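
After this change, ConvertModel and ConvertAdapter no longer accept a progress callback; progress reporting moves to the quantize path instead. A minimal caller under the new signature, assuming a safetensors model directory (both paths here are hypothetical, for illustration only):

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/convert"
)

func main() {
	// Destination GGUF file; path is hypothetical.
	f, err := os.Create("model-f16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Previously: convert.ConvertModel(fsys, f, func(api.ProgressResponse) {})
	// The callback parameter is gone; conversion runs without emitting progress.
	if err := convert.ConvertModel(os.DirFS("./safetensors-model"), f); err != nil {
		log.Fatal(err)
	}
}
```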


@@ -19,7 +19,6 @@ import (
	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/llm"
 )
@@ -32,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
	}
	defer f.Close()

-	if err := ConvertModel(fsys, f, func(api.ProgressResponse) {}); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
		t.Fatal(err)
	}
@@ -141,107 +140,6 @@ func TestConvertFull(t *testing.T) {
	}
 }

-func TestConvertInvalidDatatype(t *testing.T) {
-	f, err := os.CreateTemp(t.TempDir(), "testmodel")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	tempDir := t.TempDir()
-	generateSafetensorTestData(t, tempDir)
-
-	err = ConvertModel(os.DirFS(tempDir), f, func(api.ProgressResponse) {})
-	if err == nil || err.Error() != "unsupported safetensors model" {
-		t.Errorf("expected error but didn't get one")
-	}
-}
-
-func generateSafetensorTestData(t *testing.T, tempDir string) {
-	type tensorData struct {
-		Offsets []int  `json:"data_offsets"`
-		Type    string `json:"dtype"`
-		Shape   []int  `json:"shape"`
-	}
-	offset := 4096 * 14336
-
-	td := map[string]*tensorData{}
-	td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
-		Offsets: []int{0, offset},
-		Type:    "I8",
-		Shape:   []int{4096, 14336},
-	}
-	td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
-		Offsets: []int{offset, offset},
-		Type:    "U8",
-		Shape:   []int{},
-	}
-
-	data, err := json.Marshal(td)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var buf bytes.Buffer
-
-	l := int64(len(data))
-	err = binary.Write(&buf, binary.LittleEndian, l)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = buf.Write(data)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer fdata.Close()
-
-	_, err = fdata.Write(buf.Bytes())
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	configData := `
-{
-  "architectures": [
-    "LlamaForCausalLM"
-  ]
-}
-`
-
-	f, err := os.Create(filepath.Join(tempDir, "config.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(configData)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	tokenizerData := `
-{
-}
-`
-
-	f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(tokenizerData)
-	if err != nil {
-		t.Fatal(err)
-	}
-}
-
 func TestConvertAdapter(t *testing.T) {
	type AdapterCase struct {
		Name   string
@@ -288,7 +186,7 @@ func TestConvertAdapter(t *testing.T) {
			tempDir := t.TempDir()
			generateLoraTestData(t, tempDir)

-			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV, func(api.ProgressResponse) {}); err != nil {
+			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
				t.Fatal(err)
			}


@@ -4,7 +4,6 @@ import (
	"bytes"
	"encoding/binary"
	"encoding/json"
-	"errors"
	"fmt"
	"io"
	"io/fs"
@@ -51,10 +50,6 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
	for _, key := range keys {
		if value := headers[key]; value.Type != "" {
-			// bitsandbytes quantized models are unsupported
-			if len(value.Shape) == 0 {
-				return nil, errors.New("unsupported safetensors model")
-			}
			ts = append(ts, safetensor{
				fs:   fsys,
				path: p,


@@ -12,8 +12,6 @@ import (
	"strings"

	"golang.org/x/exp/maps"
-
-	"github.com/ollama/ollama/api"
 )

 type containerGGUF struct {
@@ -508,7 +506,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
	return binary.Write(w, binary.LittleEndian, s)
 }

-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor, fn func(api.ProgressResponse)) error {
+func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
		return err
	}
@@ -554,10 +552,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor, fn func(api.ProgressRespon
	}

	var alignment int64 = 32
-	for i, t := range ts {
-		fn(api.ProgressResponse{
-			Status: fmt.Sprintf("converting model %d%%", 100*(i+1)/len(ts)),
-		})
+	for _, t := range ts {
		if err := ggufWriteTensor(ws, t, alignment); err != nil {
			return err
		}
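
With the per-tensor fn(...) calls removed from the write loop, WriteGGUF is a pure serializer again. A minimal call under the new three-argument signature, in the same spirit as the updated tests later in this diff (the output path is illustrative):

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	f, err := os.Create("minimal.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Write a GGUF with a single metadata key and no tensors; the old
	// fourth argument, func(api.ProgressResponse) {}, is simply dropped.
	if err := llm.WriteGGUF(f, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
		log.Fatal(err)
	}
}
```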

llm/llama.h vendored (new file, 1227 additions; file diff suppressed because it is too large)


@@ -1,6 +1,6 @@
 package llm

-// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
+// #cgo CPPFLAGS: -Illama.cpp/ggml/include
 // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
@@ -9,12 +9,24 @@ package llm
 // #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
 // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
 // #include <stdlib.h>
+// #include <stdatomic.h>
 // #include "llama.h"
+// bool update_quantize_progress(float progress, void* data) {
+//     atomic_int* atomicData = (atomic_int*)data;
+//     int intProgress = *((int*)&progress);
+//     atomic_store(atomicData, intProgress);
+//     return true;
+// }
 import "C"

 import (
	"errors"
	"fmt"
+	"sync/atomic"
+	"time"
	"unsafe"

+	"github.com/ollama/ollama/api"
 )

 // SystemInfo is an unused example of calling llama.cpp functions using CGo
@@ -22,17 +34,49 @@ func SystemInfo() string {
	return C.GoString(C.llama_print_system_info())
 }

-func Quantize(infile, outfile string, ftype fileType) error {
+func Quantize(infile, outfile string, ftype fileType, fn func(resp api.ProgressResponse), tensorCount int) error {
	cinfile := C.CString(infile)
	defer C.free(unsafe.Pointer(cinfile))
	coutfile := C.CString(outfile)
	defer C.free(unsafe.Pointer(coutfile))

	params := C.llama_model_quantize_default_params()
	params.nthread = -1
	params.ftype = ftype.Value()

+	// Initialize "global" to store progress
+	store := (*int32)(C.malloc(C.sizeof_int))
+	defer C.free(unsafe.Pointer(store))
+
+	// Initialize store value, e.g., setting initial progress to 0
+	atomic.StoreInt32(store, 0)
+	params.quantize_callback_data = unsafe.Pointer(store)
+	params.quantize_callback = (C.llama_progress_callback)(C.update_quantize_progress)
+
+	ticker := time.NewTicker(30 * time.Millisecond)
+	done := make(chan struct{})
+	defer close(done)
+
+	go func() {
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				progressInt := atomic.LoadInt32(store)
+				progress := *(*float32)(unsafe.Pointer(&progressInt))
+				fn(api.ProgressResponse{
+					Status: fmt.Sprintf("quantizing model %d%%", 100*int(progress)/tensorCount),
+				})
+			case <-done:
+				fn(api.ProgressResponse{
+					Status: fmt.Sprintf("quantizing model 100%%"),
+				})
+				return
+			}
+		}
+	}()
+
	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
		return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
	}
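
The callback hands Go a float (the index of the tensor currently being quantized) by storing its raw bit pattern in a shared atomic int; the ticker goroutine loads the same 32 bits and reinterprets them as a float32 before computing a percentage. A self-contained sketch of that round trip, using math.Float32bits/Float32frombits as a stand-in for the unsafe pointer casts above (all values here are hypothetical):

```go
package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

func main() {
	var store atomic.Int32

	// Producer (the C callback in the diff): store the float's bit
	// pattern, not its numeric value, into the shared 32-bit slot.
	progress := float32(42) // e.g. 42 tensors processed so far
	store.Store(int32(math.Float32bits(progress)))

	// Consumer (the ticker goroutine): load the bits and reinterpret
	// them as a float32 again.
	loaded := math.Float32frombits(uint32(store.Load()))

	tensorCount := 291 // hypothetical total from the model's metadata
	fmt.Printf("quantizing model %d%%\n", 100*int(loaded)/tensorCount)
}
```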


@@ -41,7 +41,7 @@ func TestEstimateGPULayers(t *testing.T) {
		"tokenizer.ggml.tokens":     []string{" "},
		"tokenizer.ggml.scores":     []float32{0},
		"tokenizer.ggml.token_type": []int32{0},
-	}, tensors, func(api.ProgressResponse) {})
+	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)


@@ -0,0 +1,52 @@
From ed941590d59fc07b1ad21d6aa458588e47d1e446 Mon Sep 17 00:00:00 2001
From: Josh Yan <jyan00017@gmail.com>
Date: Wed, 10 Jul 2024 13:39:39 -0700
Subject: [PATCH] quantize progress
---
 include/llama.h | 3 +++
 src/llama.cpp   | 8 ++++++++
 2 files changed, 11 insertions(+)
diff --git a/include/llama.h b/include/llama.h
index bb4b05ba..613db68e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -349,6 +349,9 @@ extern "C" {
         bool keep_split;                     // quantize to the same number of shards
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
+
+        llama_progress_callback quantize_callback;      // callback to report quantization progress
+        void * quantize_callback_data;                  // user data for the callback
     } llama_model_quantize_params;

     // grammar types
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..ac640c02 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18252,6 +18252,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
+        if (params->quantize_callback){
+            if (!params->quantize_callback(i, params->quantize_callback_data)) {
+                return;
+            }
+        }
+
         auto weight = ml.get_weight(i);
         struct ggml_tensor * tensor = weight->tensor;
         if (weight->idx != cur_split && params->keep_split) {
@@ -18789,6 +18795,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split            =*/ false,
         /*.imatrix               =*/ nullptr,
         /*.kv_overrides          =*/ nullptr,
+        /*.quantize_callback     =*/ nullptr,
+        /*.quantize_callback_data =*/ nullptr,
     };

     return result;
--
2.39.3 (Apple Git-146)


@@ -435,11 +435,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
				return err
			}

+			tensorCount := len(baseLayer.GGML.Tensors().Items)
			ft := baseLayer.GGML.KV().FileType()
			if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
				return errors.New("quantization is only supported for F16 and F32 models")
			} else if want != ft {
-				fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantization)})
+				fn(api.ProgressResponse{
+					Status: "quantizing model tensors",
+				})

				blob, err := GetBlobsPath(baseLayer.Digest)
				if err != nil {
@@ -453,7 +456,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
				defer temp.Close()
				defer os.Remove(temp.Name())

-				if err := llm.Quantize(blob, temp.Name(), want); err != nil {
+				if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
					return err
				}
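
Because the C callback reports a tensor index rather than a percentage, CreateModel has to pass the total tensor count alongside the client-facing callback. A condensed sketch of the resulting call path (not independently compilable: fileType is unexported in llm, so want only exists inside the server package):

```go
// Inside server.CreateModel, after the F16/F32 check succeeds:
tensorCount := len(baseLayer.GGML.Tensors().Items)

fn(api.ProgressResponse{Status: "quantizing model tensors"})

// fn and tensorCount let llm.Quantize turn the raw tensor index from
// the C callback into "quantizing model N%" status updates.
if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
	return err
}
```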


@@ -98,6 +98,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
	}
	defer os.RemoveAll(p)

+	fn(api.ProgressResponse{Status: "converting model"})
	// TODO(mxyng): this should write directly into a layer
	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
	t, err := os.CreateTemp(p, "fp16")
@@ -122,18 +123,13 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
		if baseModel == nil {
			return nil, fmt.Errorf("no base model specified for the adapter")
		}

-		fn(api.ProgressResponse{
-			Status: "converting adapter",
-		})
-		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV(), fn); err != nil {
+		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
			return nil, err
		}
		layerType = "application/vnd.ollama.image.adapter"
	case "model":
-		fn(api.ProgressResponse{
-			Status: "converting model",
-		})
-		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t, fn); err != nil {
+		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
			return nil, err
		}
		layerType = "application/vnd.ollama.image.model"


@@ -145,7 +145,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
		t.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

-	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}, func(api.ProgressResponse) {}); err != nil {
+	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
		t.Fatalf("failed to write gguf: %v", err)
	}
@@ -197,7 +197,7 @@ func TestParseLayerFromCopy(t *testing.T) {
	defer file2.Close()

	for range 5 {
-		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}, func(api.ProgressResponse) {}); err != nil {
+		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
			t.Fatalf("failed to write gguf: %v", err)
		}
	}


@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
	}
	defer f.Close()

-	if err := llm.WriteGGUF(f, kv, ti, func(api.ProgressResponse) {}); err != nil {
+	if err := llm.WriteGGUF(f, kv, ti); err != nil {
		t.Fatal(err)
	}


@@ -128,8 +128,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
-	},
-		func(api.ProgressResponse) {}))
+	}))
	require.NoError(t, err)

	fname := f.Name()