fixed pretokenizer

This commit is contained in:
nicole pardal
2025-12-09 10:02:17 -08:00
parent 57c1d7db9a
commit 03abdb4969

View File

@@ -58,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
var pretokenizers []string var pretokenizers []string
if c.String("tokenizer.ggml.pre") != "default" { if c.String("tokenizer.ggml.pre") != "default" {
pretokenizers = []string{ pretokenizers = []string{
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+`,
} }
} }
processor := model.NewBytePairEncoding(&vocabulary, pretokenizers...) processor := model.NewBytePairEncoding(&vocabulary, pretokenizers...)