extras

sample: update tests and add test logits
sample: use partial sort for sorting
2025-03-12 18:28:59 +01:00 · 2025-03-12 00:55:18 -04:00 · 2025-03-12 00:46:12 -04:00 · 2025-03-12 00:45:41 -04:00
7 changed files with 120 additions and 79 deletions
--- a/README.md
+++ b/README.md
@@ -54,10 +54,6 @@ Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
-| Gemma 3            | 1B         | 815MB | `ollama run gemma3:1b`           |
-| Gemma 3            | 4B         | 3.3GB | `ollama run gemma3`              |
-| Gemma 3            | 12B        | 8.1GB | `ollama run gemma3:12b`          |
-| Gemma 3            | 27B        | 17GB  | `ollama run gemma3:27b`          |
 | QwQ                | 32B        | 20GB  | `ollama run qwq`                 |
 | DeepSeek-R1        | 7B         | 4.7GB | `ollama run deepseek-r1`         |
 | DeepSeek-R1        | 671B       | 404GB | `ollama run deepseek-r1:671b`    |
@@ -70,6 +66,9 @@ Here are some example models that can be downloaded:
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 4              | 14B        | 9.1GB | `ollama run phi4`                |
 | Phi 4 Mini         | 3.8B       | 2.5GB | `ollama run phi4-mini`           |
+| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
+| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
+| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
 | Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -195,10 +195,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
-				if strings.Contains(err.Error(), "not found") {
-					fmt.Printf("error: %v\n", err)
-					continue
-				}
 				return err
 			}
 			continue
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -15,6 +15,7 @@ type TextOptions struct {
 	attnKeyLen, attnValLen           int
 	eps, ropeScale                   float32
 	ropeLocalBase, ropeGlobalBase    float32
+	finalLogitSoftcap                float32
 	largeModelScaling                bool
 }

@@ -56,15 +57,16 @@ func newTextModel(c ml.Config) *TextModel {
 		),
 		Layers: make([]TextLayer, numBlocks),
 		TextOptions: &TextOptions{
-			hiddenSize:     int(c.Uint("embedding_length")),
-			numHeads:       int(c.Uint("attention.head_count")),
-			numKVHeads:     int(c.Uint("attention.head_count_kv")),
-			attnKeyLen:     int(c.Uint("attention.key_length", 256)),
-			attnValLen:     int(c.Uint("attention.value_length", 256)),
-			eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-06),
-			ropeLocalBase:  c.Float("rope.local.freq_base", 10000.0),
-			ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
-			ropeScale:      c.Float("rope.freq_scale", 1.0),
+			hiddenSize:        int(c.Uint("embedding_length")),
+			numHeads:          int(c.Uint("attention.head_count")),
+			numKVHeads:        int(c.Uint("attention.head_count_kv")),
+			attnKeyLen:        int(c.Uint("attention.key_length", 256)),
+			attnValLen:        int(c.Uint("attention.value_length", 256)),
+			eps:               c.Float("attention.layer_norm_rms_epsilon", 1e-06),
+			ropeLocalBase:     c.Float("rope.local.freq_base", 10000.0),
+			ropeGlobalBase:    c.Float("rope.global.freq_base", 1000000.0),
+			ropeScale:         c.Float("rope.freq_scale", 1.0),
+			finalLogitSoftcap: c.Float("final_logit_softcapping", 30.0),
 		},
 	}

@@ -243,5 +245,10 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-	return m.Output.Forward(ctx, hiddenState)
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+
+	// final logit softcap
+	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.TextOptions.finalLogitSoftcap))
+	hiddenState = hiddenState.Tanh(ctx)
+	return hiddenState.Scale(ctx, float64(m.TextOptions.finalLogitSoftcap))
 }
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -116,9 +116,19 @@ func (i *Instance) Readline() (string, error) {

 			switch r {
 			case KeyUp:
-				i.historyPrev(buf, &currentLineBuf)
+				if i.History.Pos > 0 {
+					if i.History.Pos == i.History.Size() {
+						currentLineBuf = []rune(buf.String())
+					}
+					buf.Replace([]rune(i.History.Prev()))
+				}
 			case KeyDown:
-				i.historyNext(buf, &currentLineBuf)
+				if i.History.Pos < i.History.Size() {
+					buf.Replace([]rune(i.History.Next()))
+					if i.History.Pos == i.History.Size() {
+						buf.Replace(currentLineBuf)
+					}
+				}
 			case KeyLeft:
 				buf.MoveLeft()
 			case KeyRight:
@@ -175,10 +185,6 @@ func (i *Instance) Readline() (string, error) {
 			esc = true
 		case CharInterrupt:
 			return "", ErrInterrupt
-		case CharPrev:
-			i.historyPrev(buf, &currentLineBuf)
-		case CharNext:
-			i.historyNext(buf, &currentLineBuf)
 		case CharLineStart:
 			buf.MoveToStart()
 		case CharLineEnd:
@@ -240,24 +246,6 @@ func (i *Instance) HistoryDisable() {
 	i.History.Enabled = false
 }

-func (i *Instance) historyPrev(buf *Buffer, currentLineBuf *[]rune) {
-	if i.History.Pos > 0 {
-		if i.History.Pos == i.History.Size() {
-			*currentLineBuf = []rune(buf.String())
-		}
-		buf.Replace([]rune(i.History.Prev()))
-	}
-}
-
-func (i *Instance) historyNext(buf *Buffer, currentLineBuf *[]rune) {
-	if i.History.Pos < i.History.Size() {
-		buf.Replace([]rune(i.History.Next()))
-		if i.History.Pos == i.History.Size() {
-			buf.Replace(*currentLineBuf)
-		}
-	}
-}
-
 func NewTerminal() (*Terminal, error) {
 	fd := os.Stdin.Fd()
 	termios, err := SetRawMode(fd)
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -83,8 +83,11 @@ func (s *Sampler) sample(tokens []token) (token, error) {
 		return greedy(tokens), nil
 	}

-	// topK also sorts the tokens in descending order of logits
-	tokens = topK(tokens, s.topK)
+	if s.topK > 0 {
+		tokens = topK(tokens, s.topK)
+	} else {
+		sortLogits(tokens)
+	}

 	tokens = topP(tokens, s.topP)
 	tokens = minP(tokens, s.minP)
--- a/sample/transforms.go
+++ b/sample/transforms.go
@@ -11,7 +11,7 @@ import (
 type tokenHeap []token

 func (h tokenHeap) Len() int           { return len(h) }
-func (h tokenHeap) Less(i, j int) bool { return h[i].value < h[j].value }
+func (h tokenHeap) Less(i, j int) bool { return h[i].value < h[j].value } // ascending by value, so container/heap treats this as a min-heap (smallest value is popped first)
 func (h tokenHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

 func (h *tokenHeap) Push(x any) {
@@ -28,17 +28,8 @@ func (h *tokenHeap) Pop() any {

 // topK limits the number of tokens considered to the k highest logits
 func topK(ts []token, k int) []token {
-	if k >= len(ts) || k <= 0 {
-		slices.SortFunc(ts, func(a, b token) int {
-			switch {
-			case a.value < b.value:
-				return 1
-			case a.value > b.value:
-				return -1
-			default:
-				return 0
-			}
-		})
+	if k >= len(ts) {
+		sortLogits(ts)
 		return ts
 	}

@@ -56,7 +47,7 @@ func topK(ts []token, k int) []token {
 	}

 	// Convert heap to sorted slice in descending order
-	result := make([]token, len(h))
+	result := make([]token, k)
 	for i := k - 1; i >= 0; i-- {
 		result[i] = heap.Pop(&h).(token)
 	}
@@ -110,6 +101,81 @@ func minP(ts []token, p float32) []token {
 	return ts
 }

+// partialSortLogits quickselects the n largest tokens to the front of ts in place, sorts them descending, and returns ts[:n] (n is capped at len(ts))
+func partialSortLogits(ts []token, n int) []token {
+	if n >= len(ts) {
+		n = len(ts)
+	}
+
+	left, right := 0, len(ts)-1
+	target := n - 1
+
+	// Quickselect: narrow [left, right] until index target holds the n-th largest value
+	for left < right {
+		// Take the middle element as pivot and park it at ts[right] during partitioning
+		pivot := left + (right-left)/2
+		ts[pivot], ts[right] = ts[right], ts[pivot]
+
+		// storeIndex is the slot for the next element found to be >= the pivot value
+		storeIndex := left
+		pivotValue := ts[right].value
+
+		// Partition ts[left:right]: values >= pivotValue are swapped toward the front,
+		// leaving values < pivotValue behind them
+		for i := left; i < right; i++ {
+			if ts[i].value >= pivotValue {
+				ts[storeIndex], ts[i] = ts[i], ts[storeIndex]
+				storeIndex++
+			}
+		}
+
+		// Drop the pivot between the two groups; storeIndex is now its final index
+		ts[right], ts[storeIndex] = ts[storeIndex], ts[right]
+
+		// Stop once the pivot lands on target; otherwise keep iterating (no recursion)
+		// on whichever side of the pivot still contains target
+		if storeIndex == target {
+			break
+		} else if storeIndex < target {
+			left = storeIndex + 1 // target lies to the right of the pivot
+		} else {
+			right = storeIndex - 1 // target lies to the left of the pivot
+		}
+	}
+
+	// Only the n largest need a full ordering; sort that prefix by descending value
+	slices.SortFunc(ts[:n], func(a, b token) int {
+		if a.value > b.value {
+			return -1
+		}
+		if a.value < b.value {
+			return 1
+		}
+		return 0
+	})
+
+	return ts[:n]
+}
+
+// sortLogits partially sorts ts in place via partialSortLogits: only the first n
+// tokens (n = sqrt(len(ts))+1, clamped to [100, 1000]) end up in descending order;
+// the returned prefix is discarded, so ts keeps its full length and an unordered tail
+func sortLogits(ts []token) {
+	// Heuristic prefix size: square root of the token count, plus one
+	// NOTE(review): presumably later filters only need this sorted head; confirm with callers
+	n := int(math.Sqrt(float64(len(ts)))) + 1
+
+	// Clamp n to [100, 1000]; partialSortLogits further caps it at len(ts)
+	switch {
+	case n < 100:
+		n = 100
+	case n > 1000:
+		n = 1000
+	}
+
+	partialSortLogits(ts, n)
+}
+
 func temperature(ts []token, temp float32) {
 	for i := range ts {
 		ts[i].value /= temp
--- a/sample/transforms_test.go
+++ b/sample/transforms_test.go
@@ -64,7 +64,7 @@ func TestTemperatureAndSoftmax(t *testing.T) {
 func TestTopK(t *testing.T) {
 	input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}

-	// Test k=5
+	// Test k=5
 	got := topK(toTokens(input), 5)
 	if len(got) != 5 {
 		t.Errorf("topK(5): wrong length: want 5, got %d", len(got))
@@ -77,24 +77,6 @@ func TestTopK(t *testing.T) {
 	if len(got) != len(input) {
 		t.Errorf("topK(20): wrong length: want %d, got %d", len(input), len(got))
 	}
-
-	// Test k=-1
-	input = []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
-	want = []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
-	got = topK(toTokens(input), -1)
-	if len(got) != len(input) {
-		t.Errorf("topK(-1): wrong length: want %d, got %d", len(input), len(got))
-	}
-	compareLogits(t, "topK(-1)", want, got)
-
-	// Test k=0
-	input = []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
-	want = []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
-	got = topK(toTokens(input), 0)
-	if len(got) != len(input) {
-		t.Errorf("topK(-1): wrong length: want %d, got %d", len(input), len(got))
-	}
-	compareLogits(t, "topK(-1)", want, got)
 }

 func TestTopP(t *testing.T) {
@@ -103,7 +85,7 @@ func TestTopP(t *testing.T) {

 	// First apply temperature and softmax to get probabilities
 	tokens = temperature(tokens, 1)
-	tokens = topK(tokens, 20)
+	sortLogits(tokens)

 	// Then apply topP
 	got := topP(tokens, 0.95)
@@ -135,7 +117,7 @@ func TestSortLogits(t *testing.T) {
 	input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
 	tokens := toTokens(input)

-	tokens = topK(tokens, 20)
+	sortLogits(tokens)

 	for i := 1; i < len(tokens); i++ {
 		if tokens[i].value > tokens[i-1].value {
@@ -288,7 +270,7 @@ func BenchmarkTransforms(b *testing.B) {
 		b.ResetTimer()
 		for b.Loop() {
 			copy(tokensCopy, tokens)
-			topK(tokensCopy, 200000)
+			sortLogits(tokensCopy)
 		}
 	})
 }
Author	SHA1	Message	Date
jmorganca	9622b928b4	extras	2025-03-12 18:28:59 +01:00
ParthSareen	7fa6ea0da7	sample: update tests and add test logits	2025-03-12 00:55:18 -04:00
ParthSareen	310b235626	sample: use partial sort for sorting	2025-03-12 00:46:12 -04:00
ParthSareen	448fc4cd2a	sample: use container/heap for top_k	2025-03-12 00:45:41 -04:00