Compare commits

..

4 Commits

Author SHA1 Message Date
Devon Rifkin
6de62664d9 Merge pull request #11973 from ollama/drifkin/bpe
model: fix boundary in bpe
2025-08-19 22:58:33 -07:00
Devon Rifkin
463a6caad8 model: add bpe roundtripping tests 2025-08-19 22:05:48 -07:00
Devon Rifkin
fc5fb09f51 model: fix boundary in bpe
0x007e is a tilde and was getting adjusted (+0x00a2) to 0x0120 in the
encode, but then in the decode it was getting adjusted down (-0x0100) to
0x0020. The boundary for the +0x00a2 case has been adjusted to fix this

Fixes: #11966
2025-08-19 18:34:49 -07:00
Jesse Gross
05ccb17c6e kvcache: Use Cast instead of Copy for flash attention masks
Flash attention kernels require the mask of the KV cache be a F16
rather than an F32. We can use the GGML operation ggml_cast to do
this rather than doing it ourselves, which allows reuse of a
preallocated buffer in the graph rather than allocating a new one
for each batch. This improves token generation performance with
flash attention by 10-30% (with gpt-oss). This also makes performance
with flash attention better than without it, as expected.
2025-08-19 12:36:28 -07:00
5 changed files with 60 additions and 21 deletions

View File

@@ -378,9 +378,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
if c.config.MaskDType != ml.DTypeF32 {
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
}
return maskTensor

View File

@@ -396,6 +396,7 @@ type Tensor interface {
Shape() []int
DType() DType
Cast(ctx Context, dtype DType) Tensor
Bytes() []byte
Floats() []float32

View File

@@ -843,23 +843,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
panic("set Input or Layer before creating tensors")
}
var cdtype uint32
switch dtype {
case ml.DTypeF32:
cdtype = C.GGML_TYPE_F32
case ml.DTypeF16:
cdtype = C.GGML_TYPE_F16
case ml.DTypeQ80:
cdtype = C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
cdtype = C.GGML_TYPE_Q4_0
case ml.DTypeI32:
cdtype = C.GGML_TYPE_I32
case ml.DTypeMXFP4:
cdtype = C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
cdtype := ggmlDType(dtype)
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
@@ -1056,6 +1040,32 @@ func (t *Tensor) DType() ml.DType {
}
}
func ggmlDType(dtype ml.DType) uint32 {
switch dtype {
case ml.DTypeF32:
return C.GGML_TYPE_F32
case ml.DTypeF16:
return C.GGML_TYPE_F16
case ml.DTypeQ80:
return C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
return C.GGML_TYPE_Q4_0
case ml.DTypeI32:
return C.GGML_TYPE_I32
case ml.DTypeMXFP4:
return C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
}
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
}
}
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,

View File

@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
r = 0x0143
case r <= 0x0020:
r = r + 0x0100
case r >= 0x007e && r <= 0x00a0:
case r >= 0x007f && r <= 0x00a0:
r = r + 0x00a2
}

View File

@@ -207,6 +207,36 @@ func TestLlama(t *testing.T) {
}
}
})
t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
t.Parallel()
for b := 0x00; b <= 0xFF; b++ {
input := string(rune(b))
ids, err := tokenizer.Encode(input, false)
if err != nil {
t.Errorf("failed to encode rune 0x%02X: %v", b, err)
continue
}
decoded, err := tokenizer.Decode(ids)
if err != nil {
t.Errorf("failed to decode rune 0x%02X: %v", b, err)
continue
}
if b == 0x00 {
if len(decoded) != 0 {
t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
}
continue
}
if decoded != input {
t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
}
}
})
}
func BenchmarkBytePairEncoding(b *testing.B) {