fixed pretokenizer

This commit is contained in:
nicole pardal 2025-12-09 10:02:17 -08:00
parent 57c1d7db9a
commit 03abdb4969
1 changed file with 1 addition and 1 deletion

View File

@@ -58,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
var pretokenizers []string
if c.String("tokenizer.ggml.pre") != "default" {
pretokenizers = []string{
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+`,
}
}
processor := model.NewBytePairEncoding(&vocabulary, pretokenizers...)