Merge pull request #11973 from ollama/drifkin/bpe

model: fix boundary in bpe
model: add bpe roundtripping tests
2025-08-19 22:58:33 -07:00 · 2025-08-19 22:05:48 -07:00 · 2025-08-19 18:34:49 -07:00 · 2025-08-19 12:36:28 -07:00
5 changed files with 60 additions and 21 deletions
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -378,9 +378,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
-		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
-		ctx.Forward(maskTensor.Copy(ctx, out))
-		maskTensor = out
+		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
 	}

 	return maskTensor
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -396,6 +396,7 @@ type Tensor interface {

 	Shape() []int
 	DType() DType
+	Cast(ctx Context, dtype DType) Tensor

 	Bytes() []byte
 	Floats() []float32
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -843,23 +843,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		panic("set Input or Layer before creating tensors")
 	}

-	var cdtype uint32
-	switch dtype {
-	case ml.DTypeF32:
-		cdtype = C.GGML_TYPE_F32
-	case ml.DTypeF16:
-		cdtype = C.GGML_TYPE_F16
-	case ml.DTypeQ80:
-		cdtype = C.GGML_TYPE_Q8_0
-	case ml.DTypeQ40:
-		cdtype = C.GGML_TYPE_Q4_0
-	case ml.DTypeI32:
-		cdtype = C.GGML_TYPE_I32
-	case ml.DTypeMXFP4:
-		cdtype = C.GGML_TYPE_MXFP4
-	default:
-		panic("unsupported dtype")
-	}
+	cdtype := ggmlDType(dtype)

 	if len(shape) < 1 || shape[0] == 0 {
 		var shape C.int64_t = 0
@@ -1056,6 +1040,32 @@ func (t *Tensor) DType() ml.DType {
 	}
 }

+func ggmlDType(dtype ml.DType) uint32 {
+	switch dtype {
+	case ml.DTypeF32:
+		return C.GGML_TYPE_F32
+	case ml.DTypeF16:
+		return C.GGML_TYPE_F16
+	case ml.DTypeQ80:
+		return C.GGML_TYPE_Q8_0
+	case ml.DTypeQ40:
+		return C.GGML_TYPE_Q4_0
+	case ml.DTypeI32:
+		return C.GGML_TYPE_I32
+	case ml.DTypeMXFP4:
+		return C.GGML_TYPE_MXFP4
+	default:
+		panic("unsupported dtype")
+	}
+}
+
+func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
+	}
+}
+
 func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 					r = 0x0143
 				case r <= 0x0020:
 					r = r + 0x0100
-				case r >= 0x007e && r <= 0x00a0:
+				case r >= 0x007f && r <= 0x00a0:
 					r = r + 0x00a2
 				}

--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
@@ -207,6 +207,36 @@ func TestLlama(t *testing.T) {
 			}
 		}
 	})
+
+	t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
+		t.Parallel()
+
+		for b := 0x00; b <= 0xFF; b++ {
+			input := string(rune(b))
+			ids, err := tokenizer.Encode(input, false)
+			if err != nil {
+				t.Errorf("failed to encode rune 0x%02X: %v", b, err)
+				continue
+			}
+
+			decoded, err := tokenizer.Decode(ids)
+			if err != nil {
+				t.Errorf("failed to decode rune 0x%02X: %v", b, err)
+				continue
+			}
+
+			if b == 0x00 {
+				if len(decoded) != 0 {
+					t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
+				}
+				continue
+			}
+
+			if decoded != input {
+				t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
+			}
+		}
+	})
 }

 func BenchmarkBytePairEncoding(b *testing.B) {
Author	SHA1	Message	Date
Devon Rifkin	6de62664d9	Merge pull request #11973 from ollama/drifkin/bpe — model: fix boundary in bpe	2025-08-19 22:58:33 -07:00
Devon Rifkin	463a6caad8	model: add bpe roundtripping tests	2025-08-19 22:05:48 -07:00
Devon Rifkin	fc5fb09f51	model: fix boundary in bpe — 0x007e is a tilde and was getting adjusted (+0x00a2) to 0x0120 in the encode, but then in the decode it was getting adjusted down (-0x0100) to 0x0020. The boundary for the +0x00a2 case has been adjusted to fix this. Fixes: #11966	2025-08-19 18:34:49 -07:00
Jesse Gross	05ccb17c6e	kvcache: Use Cast instead of Copy for flash attention masks — Flash attention kernels require the mask of the KV cache to be F16 rather than F32. We can use the GGML operation ggml_cast to do this rather than doing it ourselves, which allows reuse of a preallocated buffer in the graph rather than allocating a new one for each batch. This improves token generation performance with flash attention by 10-30% (with gpt-oss). This also makes performance with flash attention better than without it, as expected.	2025-08-19 12:36:28 -07:00