Compare commits


3 Commits

Author          SHA1        Message                                                         Date
Patrick Devine  9a483dc7b7  refactor to use client.List instead of walking the filesystem  2024-01-26 18:34:42 -08:00
Patrick Devine  366b38460f  fix linter                                                      2024-01-26 18:34:42 -08:00
Patrick Devine  021b1bdc4a  add --upgrade-all flag to refresh any stale models              2024-01-26 18:34:40 -08:00
25 changed files with 456 additions and 937 deletions

View File

@@ -200,21 +200,18 @@ brew install cmake go
```
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
### Running local builds
Next, start the server:
```
@@ -256,7 +253,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
## Community Integrations
### Web & Desktop
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
@@ -269,7 +265,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [MindMac](https://mindmac.app)
### Terminal
@@ -282,7 +278,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
### Database
@@ -309,7 +304,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
### Mobile
@@ -331,5 +326,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)

View File

@@ -183,11 +183,12 @@ type CopyRequest struct {
}
type PullRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
CurrentDigest string `json:"current_digest,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
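
The new `CurrentDigest` field lets a client report which manifest digest it already has, so the server can skip the pull when the registry still serves that digest. A minimal client-side sketch, assuming the `api` package from this repo (the model name and digest are placeholders):

```go
package main

import (
	"context"
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	req := &api.PullRequest{
		Model:         "llama2",           // placeholder model name
		CurrentDigest: "sha256:abc123...", // placeholder digest of the locally cached manifest
	}
	// The server streams progress responses; an unchanged digest ends the pull early.
	err = client.Pull(context.Background(), req, func(p api.ProgressResponse) error {
		fmt.Println(p.Status)
		return nil
	})
	if err != nil {
		panic(err)
	}
}
```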
@@ -241,6 +242,7 @@ type GenerateResponse struct {
type ModelDetails struct {
ParentModel string `json:"parent_model"`
Digest string `json:"digest"`
Format string `json:"format"`
Family string `json:"family"`
Families []string `json:"families"`
@@ -279,20 +281,85 @@ func (m *Metrics) Summary() {
var ErrInvalidOpts = fmt.Errorf("invalid options")
func (opts *Options) FromMap(m map[string]interface{}) error {
data, err := json.Marshal(m)
if err != nil {
return err
}
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
typeOpts := reflect.TypeOf(opts).Elem() // types of the fields in the options struct
err = json.Unmarshal(data, opts)
if err != nil {
// Custom error handling
if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
return fmt.Errorf("invalid type for option '%v': expected %v, got %v", jsonErr.Field, jsonErr.Type, jsonErr.Value)
// build map of json struct tags to their types
jsonOpts := make(map[string]reflect.StructField)
for _, field := range reflect.VisibleFields(typeOpts) {
jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
if jsonTag != "" {
jsonOpts[jsonTag] = field
}
return err
}
invalidOpts := []string{}
for key, val := range m {
if opt, ok := jsonOpts[key]; ok {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if val == nil {
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
case int64:
field.SetInt(t)
case float64:
// when JSON unmarshals numbers, it uses float64, not int
field.SetInt(int64(t))
default:
return fmt.Errorf("option %q must be of type integer", key)
}
case reflect.Bool:
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.SetBool(val)
case reflect.Float32:
// JSON unmarshals to float64
val, ok := val.(float64)
if !ok {
return fmt.Errorf("option %q must be of type float32", key)
}
field.SetFloat(val)
case reflect.String:
val, ok := val.(string)
if !ok {
return fmt.Errorf("option %q must be of type string", key)
}
field.SetString(val)
case reflect.Slice:
// JSON unmarshals to []interface{}, not []string
val, ok := val.([]interface{})
if !ok {
return fmt.Errorf("option %q must be of type array", key)
}
// convert []interface{} to []string
slice := make([]string, len(val))
for i, item := range val {
str, ok := item.(string)
if !ok {
return fmt.Errorf("option %q must be of an array of strings", key)
}
slice[i] = str
}
field.Set(reflect.ValueOf(slice))
default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind())
}
}
} else {
invalidOpts = append(invalidOpts, key)
}
}
if len(invalidOpts) > 0 {
return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
}
return nil
}
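
Since `encoding/json` decodes every JSON number into `float64`, the integer and float cases above convert from `float64` instead of asserting the target type directly. A small usage sketch of `FromMap`, assuming the usual `api.Options` field tags (`num_ctx`, `temperature`, `stop`):

```go
package main

import (
	"errors"
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	var opts api.Options
	err := opts.FromMap(map[string]interface{}{
		"temperature": 0.8,                   // float64 fills a float32 field
		"num_ctx":     float64(4096),         // JSON numbers arrive as float64 and are converted to int
		"stop":        []interface{}{"</s>"}, // []interface{} is converted to []string
		"bogus":       true,                  // unknown keys are collected, not silently dropped
	})
	fmt.Println(errors.Is(err, api.ErrInvalidOpts)) // true: "bogus" is not a known option
}
```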

View File

@@ -25,7 +25,6 @@ import (
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
"golang.org/x/exp/slices"
"golang.org/x/term"
"github.com/jmorganca/ollama/api"
@@ -147,68 +146,19 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
return RunGenerate(cmd, args)
}
func PushHandler(cmd *cobra.Command, args []string) error {
@@ -407,6 +357,42 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
}
func PullHandler(cmd *cobra.Command, args []string) error {
upgradeAll, err := cmd.Flags().GetBool("upgrade-all")
if err != nil {
return err
}
if !upgradeAll {
if len(args) == 0 {
return fmt.Errorf("no model specified to pull")
}
return pull(cmd, args[0], "")
}
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
models, err := client.List(cmd.Context())
if err != nil {
return err
}
for _, m := range (*models).Models {
err = pull(cmd, m.Name, "sha256:"+m.Digest)
if err != nil {
if strings.Contains(err.Error(), "file does not exist") {
fmt.Printf("model '%s' is no longer available\n", m.Name)
continue
}
return err
}
}
return nil
}
func pull(cmd *cobra.Command, name string, currentDigest string) error {
insecure, err := cmd.Flags().GetBool("insecure")
if err != nil {
return err
@@ -418,7 +404,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
}
p := progress.NewProgress(os.Stderr)
defer p.Stop()
defer p.StopWithoutClear()
bars := make(map[string]*progress.Bar)
@@ -452,7 +438,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil
}
request := api.PullRequest{Name: args[0], Insecure: insecure}
request := api.PullRequest{Name: name, Insecure: insecure, CurrentDigest: currentDigest}
if err := client.Pull(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -460,6 +446,51 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil
}
func RunGenerate(cmd *cobra.Command, args []string) error {
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
}
type generateContextKey string
type runOptions struct {
@@ -635,18 +666,10 @@ func generate(cmd *cobra.Command, opts runOptions) error {
return nil
}
if opts.MultiModal {
opts.Prompt, opts.Images, err = extractFileData(opts.Prompt)
if err != nil {
return err
}
}
request := api.GenerateRequest{
Model: opts.Model,
Prompt: opts.Prompt,
Context: generateContext,
Images: opts.Images,
Format: opts.Format,
System: opts.System,
Template: opts.Template,
@@ -897,12 +920,13 @@ func NewCLI() *cobra.Command {
pullCmd := &cobra.Command{
Use: "pull MODEL",
Short: "Pull a model from a registry",
Args: cobra.ExactArgs(1),
Args: cobra.RangeArgs(0, 1),
PreRunE: checkServerHeartbeat,
RunE: PullHandler,
}
pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
pullCmd.Flags().Bool("upgrade-all", false, "Upgrade all models if they're out of date")
pushCmd := &cobra.Command{
Use: "push MODEL",

View File

@@ -6,7 +6,6 @@ import (
"io"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
@@ -99,11 +98,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
if opts.MultiModal {
fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
}
fmt.Fprintln(os.Stderr, "")
}
@@ -213,7 +207,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch multiline {
case MultilineSystem:
opts.System = sb.String()
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
@@ -233,6 +226,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(&sb)
multiline = MultilinePrompt
scanner.Prompt.UseAlt = true
break
}
case scanner.Pasting:
fmt.Fprintln(&sb, line)
@@ -355,13 +349,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if args[1] == "system" {
opts.System = sb.String()
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
sb.Reset()
@@ -496,18 +487,29 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if err != nil {
return err
}
newMessage.Content = msg
// clear all previous images for better responses
// reset the context if we find another image
if len(images) > 0 {
for i := range opts.Messages {
opts.Messages[i].Images = nil
newMessage.Images = append(newMessage.Images, images...)
// reset the context for the new image
opts.Messages = []api.Message{}
} else {
if len(opts.Messages) > 1 {
newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
}
}
newMessage.Content = msg
newMessage.Images = images
if len(newMessage.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
sb.Reset()
continue
}
}
if opts.System != "" {
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
}
opts.Messages = append(opts.Messages, newMessage)
assistant, err := chat(cmd, opts)
@@ -601,10 +603,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
if os.IsNotExist(err) {
continue
}
fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}

View File

@@ -542,7 +542,7 @@ curl http://localhost:11434/api/chat -d '{
"role": "user",
"content": "what is in this image?",
"images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF
169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
}
},
]
}'
```

View File

@@ -50,8 +50,7 @@ development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
setting the environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
libraries, and `CUDACXX` to the location of the nvcc compiler.
Then generate dependencies:

View File

@@ -15,7 +15,7 @@ FROM ./mistral-7b-v0.1.Q4_0.gguf
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
FROM ./q4_0.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -37,69 +37,55 @@ ollama run example "What is your favourite condiment?"
## Importing (PyTorch & Safetensors)
> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.
### Supported models
### Setup
Ollama supports a set of model architectures, with support for more coming soon:
First, clone the `ollama/ollama` repo:
- Llama & Mistral
- Falcon & RW
- BigCode
```
git clone git@github.com:ollama/ollama.git ollama
cd ollama
```
To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
and then fetch its `llama.cpp` submodule:
```shell
git submodule init
git submodule update llm/llama.cpp
```
Next, install the Python dependencies:
```
python3 -m venv llm/llama.cpp/.venv
source llm/llama.cpp/.venv/bin/activate
pip install -r llm/llama.cpp/requirements.txt
```
Then build the `quantize` tool:
```
make -C llm/llama.cpp quantize
```
### Clone the HuggingFace repository (optional)
### Step 1: Clone the HuggingFace repository (optional)
If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
```
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
cd Mistral-7B-Instruct-v0.1
```
### Convert the model
### Step 2: Convert and quantize to a `.bin` file (optional, for PyTorch and Safetensors)
> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`
If the model is in PyTorch or Safetensors format, a [Docker image](https://hub.docker.com/r/ollama/quantize) with the tooling required to convert and quantize models is available.
First, install [Docker](https://www.docker.com/get-started/).
Next, to convert and quantize your model, run:
```
python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
docker run --rm -v .:/model ollama/quantize -q q4_0 /model
```
### Quantize the model
This will output two files into the directory:
```
llm/llama.cpp/quantize converted.bin quantized.bin q4_0
```
- `f16.bin`: the model converted to GGUF
- `q4_0.bin`: the model quantized to 4 bits (Ollama will use this file to create the Ollama model)
### Step 3: Write a `Modelfile`
Next, create a `Modelfile` for your model:
```
FROM quantized.bin
FROM ./q4_0.bin
```
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./q4_0.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -163,3 +149,47 @@ The quantization options are as follows (from highest to lowest levels of
- `q6_K`
- `q8_0`
- `f16`
## Manually converting & quantizing models
### Prerequisites
Start by cloning the `llama.cpp` repo to your machine in another directory:
```
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
```
Next, install the Python dependencies:
```
pip install -r requirements.txt
```
Finally, build the `quantize` tool:
```
make quantize
```
### Convert the model
Run the correct conversion script for your model architecture:
```shell
# LlamaForCausalLM or MistralForCausalLM
python convert.py <path to model directory>
# FalconForCausalLM
python convert-falcon-hf-to-gguf.py <path to model directory>
# GPTBigCodeForCausalLM
python convert-starcoder-hf-to-gguf.py <path to model directory>
```
### Quantize the model
```
quantize <path to model dir>/ggml-model-f32.bin <path to model dir>/q4_0.bin q4_0
```

View File

@@ -12,13 +12,6 @@ On Linux systems with systemd, the logs can be found with this command:
journalctl -u ollama
```
When you run Ollama in a container, the logs go to stdout/stderr in the container:
```shell
docker logs <container-name>
```
(Use `docker ps` to find the container name)
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.

View File

@@ -30,8 +30,8 @@ type handles struct {
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
// With our current CUDA compile flags, 5.2 and older will not work properly
const CudaComputeMajorMin = 6
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
@@ -122,34 +122,34 @@ func GetGPUInfo() GpuInfo {
initGPUHandles()
}
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" && runtime.GOARCH == "amd64" {
if cpuVariant == "" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
if gpuHandles.cuda != nil && cpuVariant != "" {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
} else {
// Verify minimum compute capability
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
} else if cc.major >= CudaComputeMajorMin {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
} else if gpuHandles.rocm != nil && cpuVariant != "" {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
@@ -157,7 +157,7 @@ func GetGPUInfo() GpuInfo {
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
} else {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
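
The two versions of this check encode the minimum compute capability differently: one as a (major, minor) pair compared lexicographically, the other as a major-only threshold. A standalone sketch of the pair comparison used with `CudaComputeMin`:

```go
package main

import "fmt"

// meetsMinimum reports whether a detected compute capability (major, minor)
// is at least the required minimum, mirroring the CudaComputeMin comparison.
func meetsMinimum(major, minor, minMajor, minMinor int) bool {
	return major > minMajor || (major == minMajor && minor >= minMinor)
}

func main() {
	fmt.Println(meetsMinimum(5, 2, 5, 0)) // true: 5.2 satisfies a 5.0 minimum
	fmt.Println(meetsMinimum(3, 7, 5, 0)) // false: 3.7 is too old
}
```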

View File

@@ -178,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->str = strdup("nvml handle not initialized");
resp->status = 1;
return;
}
@@ -195,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
resp->str = strdup(buf);
}
#endif // __APPLE__
#endif // __APPLE__

View File

@@ -4,7 +4,7 @@ package llm
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
@@ -161,10 +161,13 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))
request := map[string]any{
"prompt": predict.Prompt,
@@ -186,7 +189,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"image_data": imageData,
"cache_prompt": true,
}

View File

@@ -26,13 +26,13 @@
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::atomic<bool> ext_server_running(false);
std::thread ext_server_thread;
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
@@ -122,23 +122,18 @@ void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
ext_server_running = true;
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_all_tasks_finished(std::bind(
&llama_server_context::run_on_all_tasks_finished, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
while (ext_server_running.load()) {
if (!llama->update_slots()) {
LOG_TEE(
"unexpected error in llama server update_slots - exiting main "
"loop\n");
break;
}
}
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
@@ -151,10 +146,13 @@ void llama_server_start() {
void llama_server_stop() {
assert(llama != NULL);
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
// TODO - too verbose, remove once things are solid
LOG_TEE("requesting llama server shutdown\n");
ext_server_running = false;
// unblocks the update_slots() loop so it can clean up and exit
llama->request_cancel(0);
ext_server_thread.join();
delete llama;
llama = NULL;
@@ -167,9 +165,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
resp->msg[0] = '\0';
try {
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
resp->id = llama->request_completion(data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
@@ -187,22 +183,16 @@ void llama_server_completion_next_result(const int task_id,
resp->json_resp = NULL;
std::string result_json;
try {
task_result result = llama->queue_results.recv(task_id);
task_result result = llama->next_result(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
}
} catch (std::exception &e) {
resp->error = true;
@@ -233,7 +223,6 @@ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -318,15 +307,13 @@ void llama_server_embedding(const char *json_req, char **json_resp,
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->queue_results.recv(task_id);
const int task_id = llama->request_completion(
{{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->next_result(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());

View File

@@ -39,9 +39,6 @@ init_vars() {
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
git_module_setup() {
@@ -65,17 +62,15 @@ apply_patches() {
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
@@ -114,12 +109,4 @@ compress_libs() {
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
fi
}

View File

@@ -128,11 +128,6 @@ if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
@@ -140,7 +135,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
@@ -156,8 +151,6 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi

View File

@@ -25,11 +25,6 @@ function init_vars {
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
}
function git_module_setup {
@@ -156,7 +151,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
build
install
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"

View File

@@ -62,7 +62,7 @@ const maxRetries = 3
type PredictOpts struct {
Prompt string
Format string
Images []ImageData
Images []api.ImageData
Options api.Options
}

View File

@@ -1,8 +1,8 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a48582ad..9fffffd8 100644
index 0462fbd2..4fa7b57f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1564,12 +1564,6 @@ struct llama_server_context
@@ -1857,12 +1857,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
@@ -15,8 +15,8 @@ index a48582ad..9fffffd8 100644
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
@@ -1581,6 +1575,12 @@ struct llama_server_context
}
@@ -1870,6 +1864,12 @@ struct llama_server_context
slot.n_past--;
}
+ LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);

View File

@@ -1,90 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 11dd82c3..311495a8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
+#include <signal.h>
using json = nlohmann::json;
@@ -2394,6 +2395,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3014,8 +3018,14 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
- llama.queue_tasks.start_loop();
+ shutdown_handler = [&](int) {
+ llama.queue_tasks.terminate();
+ };
+ signal(SIGTERM, signal_handler);
+ signal(SIGINT, signal_handler);
+ llama.queue_tasks.start_loop();
+ svr.stop();
t.join();
llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 70cce072..2acb1eab 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -6,6 +6,7 @@
#include <mutex>
#include <condition_variable>
#include <unordered_map>
+#include <atomic>
#include "json.hpp"
@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
+ std::atomic<bool> running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -248,9 +250,15 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
- // Start the main loop. This call is blocking
- [[noreturn]]
+ // end the start_loop routine
+ void terminate() {
+ running = false;
+ condition_tasks.notify_all();
+ }
+
+ // Start the main loop.
void start_loop() {
+ running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -294,8 +302,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
+ if (!running.load()) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
condition_tasks.wait(lock, [&]{
- return !queue_tasks.empty();
+ return (!queue_tasks.empty() || !running.load());
});
}
}
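
The patch above gives `start_loop` a clean exit: `terminate()` clears a `running` flag and notifies the condition variable, and the wait predicate checks the flag alongside the queue so a blocked loop can wake up and return. A hedged Go analogue of that wake-and-check pattern, using `sync.Cond` (all names are illustrative):

```go
package main

import (
	"fmt"
	"sync"
)

type taskQueue struct {
	mu      sync.Mutex
	cond    *sync.Cond
	tasks   []string
	running bool
}

func newTaskQueue() *taskQueue {
	q := &taskQueue{running: true}
	q.cond = sync.NewCond(&q.mu)
	return q
}

func (q *taskQueue) add(t string) {
	q.mu.Lock()
	q.tasks = append(q.tasks, t)
	q.mu.Unlock()
	q.cond.Signal()
}

// terminate mirrors the patch: clear the flag, then wake the loop so it
// observes the flag instead of blocking forever on an empty queue.
func (q *taskQueue) terminate() {
	q.mu.Lock()
	q.running = false
	q.mu.Unlock()
	q.cond.Broadcast()
}

func (q *taskQueue) loop() {
	for {
		q.mu.Lock()
		// wait for either a new task or a shutdown request
		for len(q.tasks) == 0 && q.running {
			q.cond.Wait()
		}
		if len(q.tasks) == 0 && !q.running {
			q.mu.Unlock()
			return // the Go equivalent of "ending start_loop"
		}
		t := q.tasks[0]
		q.tasks = q.tasks[1:]
		q.mu.Unlock()
		fmt.Println("processed", t)
	}
}

func main() {
	q := newTaskQueue()
	done := make(chan struct{})
	go func() { q.loop(); close(done) }()
	q.add("hello")
	q.terminate()
	<-done
}
```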

View File

@@ -52,6 +52,10 @@ func (p *Progress) Stop() bool {
return stopped
}
func (p *Progress) StopWithoutClear() bool {
return p.stop()
}
func (p *Progress) StopAndClear() bool {
fmt.Fprint(p.w, "\033[?25l")
defer fmt.Fprint(p.w, "\033[?25h")

View File

@@ -147,7 +147,12 @@ func (s SignatureData) Bytes() []byte {
// SignData takes a SignatureData object and signs it with a raw private key
func (s SignatureData) Sign(rawKey []byte) (string, error) {
signer, err := ssh.ParsePrivateKey(rawKey)
privateKey, err := ssh.ParseRawPrivateKey(rawKey)
if err != nil {
return "", err
}
signer, err := ssh.NewSignerFromKey(privateKey)
if err != nil {
return "", err
}
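
Both paths use `golang.org/x/crypto/ssh`: `ParsePrivateKey` returns an `ssh.Signer` in one step, while the replacement parses the raw key first and wraps it with `NewSignerFromKey`. A minimal sketch of the two-step path (the key path and payload are placeholders):

```go
package main

import (
	"crypto/rand"
	"fmt"
	"os"

	"golang.org/x/crypto/ssh"
)

func main() {
	pemBytes, err := os.ReadFile("/path/to/id_ed25519") // placeholder key path
	if err != nil {
		panic(err)
	}
	privateKey, err := ssh.ParseRawPrivateKey(pemBytes) // a raw crypto private key, not yet a Signer
	if err != nil {
		panic(err)
	}
	signer, err := ssh.NewSignerFromKey(privateKey) // wrap the raw key in an ssh.Signer
	if err != nil {
		panic(err)
	}
	sig, err := signer.Sign(rand.Reader, []byte("payload to sign"))
	if err != nil {
		panic(err)
	}
	fmt.Println(sig.Format)
}
```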

View File

@@ -63,7 +63,6 @@ type PromptVars struct {
Prompt string
Response string
First bool
Images []llm.ImageData
}
// extractParts extracts the parts of the template before and after the {{.Response}} node.
@@ -120,6 +119,10 @@ func Prompt(promptTemplate string, p PromptVars) (string, error) {
// PreResponsePrompt returns the prompt before the response tag
func (m *Model) PreResponsePrompt(p PromptVars) (string, error) {
if p.System == "" {
// use the default system prompt for this model if one is not specified
p.System = m.System
}
pre, _, err := extractParts(m.Template)
if err != nil {
return "", err
@@ -147,68 +150,62 @@ func (m *Model) PostResponseTemplate(p PromptVars) (string, error) {
return Prompt(post, p)
}
type ChatHistory struct {
Prompts []PromptVars
LastSystem string
}
// ChatPrompts returns a list of formatted chat prompts from a list of messages
func (m *Model) ChatPrompts(msgs []api.Message) (*ChatHistory, error) {
func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
// build the prompt from the list of messages
lastSystem := m.System
var prompt strings.Builder
var currentImages []api.ImageData
currentVars := PromptVars{
First: true,
System: m.System,
}
prompts := []PromptVars{}
var images []llm.ImageData
writePrompt := func() error {
p, err := Prompt(m.Template, currentVars)
if err != nil {
return err
}
prompt.WriteString(p)
currentVars = PromptVars{}
return nil
}
for _, msg := range msgs {
switch strings.ToLower(msg.Role) {
case "system":
// if this is the first message it overrides the system prompt in the modelfile
if !currentVars.First && currentVars.System != "" {
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if currentVars.System != "" {
if err := writePrompt(); err != nil {
return "", nil, err
}
}
currentVars.System = msg.Content
lastSystem = msg.Content
case "user":
if currentVars.Prompt != "" {
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if err := writePrompt(); err != nil {
return "", nil, err
}
}
currentVars.Prompt = msg.Content
for i := range msg.Images {
id := len(images) + i
currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
currentVars.Images = append(currentVars.Images, llm.ImageData{
ID: id,
Data: msg.Images[i],
})
}
images = append(images, currentVars.Images...)
currentImages = msg.Images
case "assistant":
currentVars.Response = msg.Content
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if err := writePrompt(); err != nil {
return "", nil, err
}
default:
return nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
}
}
// Append the last set of vars if they are non-empty
if currentVars.Prompt != "" || currentVars.System != "" {
prompts = append(prompts, currentVars)
p, err := m.PreResponsePrompt(currentVars)
if err != nil {
return "", nil, fmt.Errorf("pre-response template: %w", err)
}
prompt.WriteString(p)
}
return &ChatHistory{
Prompts: prompts,
LastSystem: lastSystem,
}, nil
return prompt.String(), currentImages, nil
}
type ManifestV2 struct {
@@ -474,7 +471,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
switch {
case errors.Is(err, os.ErrNotExist):
fn(api.ProgressResponse{Status: "pulling model"})
if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
if err := PullModel(ctx, c.Args, "", &RegistryOptions{}, fn); err != nil {
return err
}
@@ -1044,7 +1041,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return nil
}
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
func PullModel(ctx context.Context, name, currentDigest string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
@@ -1072,13 +1069,23 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return fmt.Errorf("insecure protocol http")
}
fn(api.ProgressResponse{Status: "pulling manifest"})
if currentDigest == "" {
fn(api.ProgressResponse{Status: "pulling manifest"})
}
manifest, err = pullModelManifest(ctx, mp, regOpts)
manifest, err = pullModelManifest(ctx, mp, currentDigest, regOpts)
if err != nil {
return fmt.Errorf("pull model manifest: %s", err)
}
if currentDigest != "" {
if manifest == nil {
// we already have the model
return nil
}
fn(api.ProgressResponse{Status: "upgrading " + mp.GetShortTagname()})
}
var layers []*Layer
layers = append(layers, manifest.Layers...)
layers = append(layers, manifest.Config)
@@ -1150,17 +1157,27 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return nil
}
func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptions) (*ManifestV2, error) {
func pullModelManifest(ctx context.Context, mp ModelPath, currentDigest string, regOpts *RegistryOptions) (*ManifestV2, error) {
requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)
headers := make(http.Header)
headers.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
if currentDigest != "" {
headers.Set("If-None-Match", currentDigest)
}
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, regOpts)
if err != nil {
return nil, err
}
defer resp.Body.Close()
// todo we can potentially read the manifest locally and return it here
if resp.StatusCode == http.StatusNotModified {
return nil, nil
}
var m *ManifestV2
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
return nil, err
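
Sending the local digest as `If-None-Match` makes the manifest fetch conditional: a registry that still serves that digest answers `304 Not Modified`, which the caller above maps to "nothing to pull". A generic sketch of the same pattern with plain `net/http` (the URL and digest are placeholders):

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	url := "https://registry.example.com/v2/library/llama2/manifests/latest" // placeholder
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
	req.Header.Set("If-None-Match", "sha256:abc123...") // placeholder digest of the cached manifest
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotModified {
		fmt.Println("manifest unchanged; nothing to pull")
		return
	}
	fmt.Println("manifest changed; decode resp.Body and fetch layers")
}
```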

View File

@@ -1,7 +1,6 @@
package server
import (
"bytes"
"strings"
"testing"
@@ -234,58 +233,17 @@ func TestModel_PreResponsePrompt_PostResponsePrompt(t *testing.T) {
}
}
func chatHistoryEqual(a, b ChatHistory) bool {
if len(a.Prompts) != len(b.Prompts) {
return false
}
for i, v := range a.Prompts {
if v.First != b.Prompts[i].First {
return false
}
if v.Response != b.Prompts[i].Response {
return false
}
if v.Prompt != b.Prompts[i].Prompt {
return false
}
if v.System != b.Prompts[i].System {
return false
}
if len(v.Images) != len(b.Prompts[i].Images) {
return false
}
for j, img := range v.Images {
if img.ID != b.Prompts[i].Images[j].ID {
return false
}
if !bytes.Equal(img.Data, b.Prompts[i].Images[j].Data) {
return false
}
}
}
return a.LastSystem == b.LastSystem
}
func TestChat(t *testing.T) {
tests := []struct {
name string
model Model
msgs []api.Message
want ChatHistory
wantErr string
name string
template string
msgs []api.Message
want string
wantErr string
}{
{
name: "Single Message",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
},
name: "Single Message",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
@@ -296,22 +254,34 @@ func TestChat(t *testing.T) {
Content: "What are the potion ingredients?",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
First: true,
},
},
LastSystem: "You are a Wizard.",
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
},
{
name: "Message History",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
name: "First Message",
template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
Content: "You are a Wizard.",
},
{
Role: "user",
Content: "What are the potion ingredients?",
},
{
Role: "assistant",
Content: "eye of newt",
},
{
Role: "user",
Content: "Anything else?",
},
},
want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST]eye of newt[INST] Anything else? [/INST]",
},
{
name: "Message History",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
@@ -330,85 +300,18 @@ func TestChat(t *testing.T) {
Content: "Anything else?",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST] Anything else? [/INST]",
},
{
name: "Assistant Only",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
},
name: "Assistant Only",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "assistant",
Content: "everything nice",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
Response: "everything nice",
First: true,
},
},
},
},
{
name: "Last system message is preserved from modelfile",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
System: "You are Mojo Jojo.",
},
msgs: []api.Message{
{
Role: "user",
Content: "hi",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are Mojo Jojo.",
Prompt: "hi",
First: true,
},
},
LastSystem: "You are Mojo Jojo.",
},
},
{
name: "Last system message is preserved from messages",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
System: "You are Mojo Jojo.",
},
msgs: []api.Message{
{
Role: "system",
Content: "You are Professor Utonium.",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are Professor Utonium.",
First: true,
},
},
LastSystem: "You are Professor Utonium.",
},
want: "[INST] [/INST]everything nice",
},
{
name: "Invalid Role",
@@ -423,8 +326,11 @@ func TestChat(t *testing.T) {
}
for _, tt := range tests {
m := Model{
Template: tt.template,
}
t.Run(tt.name, func(t *testing.T) {
got, err := tt.model.ChatPrompts(tt.msgs)
got, _, err := m.ChatPrompt(tt.msgs)
if tt.wantErr != "" {
if err == nil {
t.Errorf("ChatPrompt() expected error, got nil")
@@ -432,10 +338,9 @@ func TestChat(t *testing.T) {
if !strings.Contains(err.Error(), tt.wantErr) {
t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
}
return
}
if !chatHistoryEqual(*got, tt.want) {
t.Errorf("ChatPrompt() got = %#v, want %#v", got, tt.want)
if got != tt.want {
t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
}
})
}

View File

@@ -178,7 +178,11 @@ func GenerateHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -235,15 +239,6 @@ func GenerateHandler(c *gin.Context) {
Prompt: req.Prompt,
First: len(req.Context) == 0,
}
if promptVars.System == "" {
promptVars.System = model.System
}
for i := range req.Images {
promptVars.Prompt += fmt.Sprintf(" [img-%d]", i)
}
p, err := model.PreResponsePrompt(promptVars)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -253,8 +248,6 @@ func GenerateHandler(c *gin.Context) {
prompt = rebuild.String()
}
slog.Debug("generate handler", "prompt", prompt)
ch := make(chan any)
var generated strings.Builder
go func() {
@@ -308,19 +301,11 @@ func GenerateHandler(c *gin.Context) {
ch <- resp
}
var images []llm.ImageData
for i := range req.Images {
images = append(images, llm.ImageData{
ID: i,
Data: req.Images[i],
})
}
// Start prediction
predictReq := llm.PredictOpts{
Prompt: prompt,
Format: req.Format,
Images: images,
Images: req.Images,
Options: opts,
}
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
@@ -392,7 +377,11 @@ func EmbeddingHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -462,7 +451,7 @@ func PullModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := PullModel(ctx, model, regOpts, fn); err != nil {
if err := PullModel(ctx, model, req.CurrentDigest, regOpts, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -684,6 +673,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
modelDetails := api.ModelDetails{
ParentModel: model.ParentModel,
Digest: "sha256:" + model.Digest,
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
@@ -942,26 +932,13 @@ func (s *Server) GenerateRoutes() http.Handler {
}
func Serve(ln net.Listener) error {
level := slog.LevelInfo
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
var programLevel = new(slog.LevelVar)
h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: programLevel, AddSource: true})
slog.SetDefault(slog.New(h))
programLevel.Set(slog.LevelDebug)
slog.Debug("Debug logging enabled")
}
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
@@ -1104,7 +1081,11 @@ func ChatHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1134,20 +1115,12 @@ func ChatHandler(c *gin.Context) {
checkpointLoaded := time.Now()
chat, err := model.ChatPrompts(req.Messages)
prompt, images, err := model.ChatPrompt(req.Messages)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
prompt, images, err := trimmedPrompt(c.Request.Context(), chat, model)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
slog.Debug("chat handler", "prompt", prompt)
ch := make(chan any)
go func() {
@@ -1221,115 +1194,3 @@ func ChatHandler(c *gin.Context) {
streamResponse(c, ch)
}
// promptInfo stores the variables used to template a prompt, and the token length of the resulting template for some model
type promptInfo struct {
vars PromptVars
tokenLen int
}
// trimmedPrompt builds a prompt to send to a running model. It ensures the prompt fits within the max context length,
// while preserving the most recent system message.
func trimmedPrompt(ctx context.Context, chat *ChatHistory, model *Model) (string, []llm.ImageData, error) {
if len(chat.Prompts) == 0 {
return "", nil, nil
}
var promptsToAdd []promptInfo
var totalTokenLength int
var systemPromptIncluded bool
var images []llm.ImageData
// reverse iterate through the prompts to build the prompt string in a way that fits the max context length
for i := len(chat.Prompts) - 1; i >= 0; i-- {
prompt := chat.Prompts[i]
promptText, err := promptString(model, prompt, i == len(chat.Prompts)-1)
if err != nil {
return "", nil, err
}
encodedTokens, err := loaded.runner.Encode(ctx, promptText)
if err != nil {
return "", nil, err
}
if totalTokenLength+len(encodedTokens) > loaded.NumCtx && i != len(chat.Prompts)-1 {
break // reached max context length, stop adding more prompts
}
for j := range prompt.Images {
if totalTokenLength+768 > loaded.NumCtx {
// this decreases the token length but overestimating is fine
prompt.Prompt = strings.ReplaceAll(prompt.Prompt, fmt.Sprintf(" [img-%d]", prompt.Images[j].ID), "")
continue
}
totalTokenLength += 768
images = append(images, prompt.Images[j])
}
totalTokenLength += len(encodedTokens)
systemPromptIncluded = systemPromptIncluded || prompt.System != ""
promptsToAdd = append(promptsToAdd, promptInfo{vars: prompt, tokenLen: len(encodedTokens)})
}
// ensure the system prompt is included, if not already
if chat.LastSystem != "" && !systemPromptIncluded {
var err error
promptsToAdd, err = includeSystemPrompt(ctx, chat.LastSystem, totalTokenLength, promptsToAdd)
if err != nil {
return "", nil, err
}
}
promptsToAdd[len(promptsToAdd)-1].vars.First = true
// construct the final prompt string from the prompts which fit within the context window
var result string
for i, prompt := range promptsToAdd {
promptText, err := promptString(model, prompt.vars, i == 0)
if err != nil {
return "", nil, err
}
result = promptText + result
}
return result, images, nil
}
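trimmedPrompt (removed here in favour of model.ChatPrompt) walks the history newest-first, stops once the encoded length would exceed NumCtx, always keeps the most recent turn, and budgets a flat 768 tokens per image. A stripped-down sketch of that budgeting idea, using word counts in place of the runner's Encode call:

```
package main

import (
	"fmt"
	"strings"
)

// truncate keeps the newest messages whose combined "token" counts fit the
// budget, mirroring the reverse iteration in trimmedPrompt. Words stand in
// for tokens here; the real code asks the loaded runner to encode each turn.
func truncate(messages []string, budget int) []string {
	total, keep := 0, 0
	for i := len(messages) - 1; i >= 0; i-- {
		n := len(strings.Fields(messages[i]))
		// the most recent message is always kept, even if it alone exceeds the budget
		if total+n > budget && i != len(messages)-1 {
			break
		}
		total += n
		keep++
	}
	return messages[len(messages)-keep:]
}

func main() {
	history := []string{
		"What are the potion ingredients?",
		"Anything else?",
		"... and?",
	}
	fmt.Println(truncate(history, 4)) // a 4-"token" budget keeps only the two newest turns
}
```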
// promptString applies the model template to the prompt
func promptString(model *Model, vars PromptVars, isMostRecent bool) (string, error) {
if isMostRecent {
p, err := model.PreResponsePrompt(vars)
if err != nil {
return "", fmt.Errorf("pre-response template: %w", err)
}
return p, nil
}
p, err := Prompt(model.Template, vars)
if err != nil {
return "", err
}
return p, nil
}
// includeSystemPrompt adjusts the prompts to include the system prompt.
func includeSystemPrompt(ctx context.Context, systemPrompt string, totalTokenLength int, promptsToAdd []promptInfo) ([]promptInfo, error) {
systemTokens, err := loaded.runner.Encode(ctx, systemPrompt)
if err != nil {
return nil, err
}
for i := len(promptsToAdd) - 1; i >= 0; i-- {
if totalTokenLength+len(systemTokens) <= loaded.NumCtx {
promptsToAdd[i].vars.System = systemPrompt
return promptsToAdd[:i+1], nil
}
totalTokenLength -= promptsToAdd[i].tokenLen
}
// if we get here, the system prompt did not fit anywhere, so return only the most recent prompt with the system message set
recent := promptsToAdd[len(promptsToAdd)-1]
recent.vars.System = systemPrompt
return []promptInfo{recent}, nil
}
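includeSystemPrompt covers the case where truncation dropped the turn carrying the system message: it keeps shedding the oldest retained turns until the system prompt fits, and otherwise keeps a single turn with the system message attached. A simplified sketch of that shedding logic over plain token counts (ordered oldest-first for readability):

```
package main

import "fmt"

// turn is one retained chat turn and its token count.
type turn struct{ tokens int }

// fitSystem drops the oldest turns until the system prompt's sys tokens fit
// inside numCtx; if it never fits, only the newest turn is kept.
func fitSystem(turns []turn, total, sys, numCtx int) []turn {
	for len(turns) > 1 && total+sys > numCtx {
		total -= turns[0].tokens
		turns = turns[1:]
	}
	return turns
}

func main() {
	turns := []turn{{tokens: 40}, {tokens: 30}, {tokens: 20}}
	kept := fitSystem(turns, 90, 25, 100)
	fmt.Println(len(kept)) // 2: dropping the oldest 40-token turn makes room for the system prompt
}
```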

View File

@@ -16,7 +16,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/version"
)
@@ -240,258 +239,3 @@ func Test_Routes(t *testing.T) {
}
}
func Test_ChatPrompt(t *testing.T) {
tests := []struct {
name string
template string
chat *ChatHistory
numCtx int
runner MockLLM
want string
wantErr string
}{
{
name: "Single Message",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
First: true,
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
},
{
name: "First Message",
template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "eye of newt",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 2,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen
},
want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST]eye of newt[INST] Anything else? [/INST]",
},
{
name: "Message History",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 4,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen, 1 for each message
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST] Anything else? [/INST]",
},
{
name: "Assistant Only",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Response: "everything nice",
First: true,
},
},
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] [/INST]everything nice",
},
{
name: "Message History Truncated, No System",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
Response: "spice",
},
{
Prompt: "... and?",
},
},
},
numCtx: 2, // room for the most recent message plus one turn of history
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] Anything else? [/INST]spice[INST] ... and? [/INST]",
},
{
name: "System is Preserved when Truncated",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the magic words?",
Response: "abracadabra",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 2,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. What is the spell for invisibility? [/INST]",
},
{
name: "System is Preserved when Length Exceeded",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the magic words?",
Response: "abracadabra",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. What is the spell for invisibility? [/INST]",
},
{
name: "First is Preserved when Truncated",
template: "[INST] {{ if .First }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
// first message omitted for test
{
Prompt: "Do you have a magic hat?",
Response: "Of course.",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 3, // two most recent messages and room for system message
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. Do you have a magic hat? [/INST]Of course.[INST] What is the spell for invisibility? [/INST]",
},
{
name: "Most recent message is returned when longer than ctxLen",
template: "[INST] {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What is the spell for invisibility?",
First: true,
},
},
},
numCtx: 1, // smaller than the 2-token encoding of the only message
runner: MockLLM{
encoding: []int{1, 2},
},
want: "[INST] What is the spell for invisibility? [/INST]",
},
}
for _, testCase := range tests {
tt := testCase
m := &Model{
Template: tt.template,
}
t.Run(tt.name, func(t *testing.T) {
loaded.runner = &tt.runner
loaded.Options = &api.Options{
Runner: api.Runner{
NumCtx: tt.numCtx,
},
}
// TODO: add tests for trimming images
got, _, err := trimmedPrompt(context.Background(), tt.chat, m)
if tt.wantErr != "" {
if err == nil {
t.Errorf("ChatPrompt() expected error, got nil")
}
if !strings.Contains(err.Error(), tt.wantErr) {
t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
}
}
if got != tt.want {
t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
}
})
}
}
type MockLLM struct {
encoding []int
}
func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
return nil
}
func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
return llm.encoding, nil
}
func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
return "", nil
}
func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
return []float64{}, nil
}
func (llm *MockLLM) Close() {
// do nothing
}
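MockLLM only needs to satisfy whatever interface loaded.runner expects. If that interface is llm.LLM — an assumption, since its definition is not part of this diff — a compile-time assertion in the test file would catch signature drift:

```
// hypothetical compile-time check; assumes the runner interface is llm.LLM
var _ llm.LLM = (*MockLLM)(nil)
```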