Compare commits


9 Commits

Author SHA1 Message Date
Jeffrey Morgan
7db5bcf73b fix go-staticcheck warning 2023-12-10 11:44:27 -05:00
Jeffrey Morgan
fa2f095bd9 fix model name returned by /api/generate being different than the model name provided 2023-12-10 11:42:15 -05:00
Jeffrey Morgan
045b855db9 fix error on accumulating final chat response 2023-12-10 11:24:39 -05:00
Jeffrey Morgan
32064a0646 fix empty response when receiving runner error 2023-12-10 10:53:38 -05:00
Jeffrey Morgan
d9a250e9b5 seek to end of file when decoding older model formats 2023-12-09 21:14:35 -05:00
Jeffrey Morgan
944519ed16 seek to eof for older model binaries 2023-12-09 20:48:57 -05:00
Jeffrey Morgan
2dd040d04c do not use --parallel 2 for old runners 2023-12-09 20:17:33 -05:00
Bruce MacDonald
bbe41ce41a fix: parallel queueing race condition caused silent failure (#1445)
* fix: queued request failures

- increase parallel requests to 2 so a queued request can complete; queueing is managed in ollama (see the sketch after the commit list)

* log stream errors
2023-12-09 14:14:02 -05:00
Jeffrey Morgan
9e1406e4ed Don't expose model information in /api/generate 2023-12-09 02:05:43 -08:00
7 changed files with 114 additions and 84 deletions
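
A minimal sketch of the queueing idea behind bbe41ce41a, as referenced in the commit message above. This is an illustration, not ollama's actual scheduler: giving the runner two parallel slots (`--parallel 2`) lets a request queued by the server start while the active one finishes, instead of failing silently.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	// Two-slot semaphore, mirroring the effect of --parallel 2 on the
	// runner: one request can be queued while another is in flight.
	sem := make(chan struct{}, 2)
	var wg sync.WaitGroup

	for i := 1; i <= 3; i++ {
		wg.Add(1)
		sem <- struct{}{} // acquire a slot; blocks only when both are busy
		go func(id int) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot when finished
			fmt.Printf("request %d running\n", id)
			time.Sleep(10 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}
```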

View File

@@ -203,22 +203,12 @@ type GenerateResponse struct {
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
ModelConfiguration ModelConfiguration `json:"model_configuration"`
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
type ModelConfiguration struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
}
func (m *Metrics) Summary() {
if m.TotalDuration > 0 {
fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)

View File

@@ -252,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 4,
"num_ctx": 1024,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
@@ -267,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}
}'
```

View File

@@ -95,10 +95,6 @@ The manifest lists all the layers used in this model. You will see a `media type
To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
### I downloaded most of a model yesterday, but it's gone today. What happened?
When the Ollama server starts, it looks for fragments of partially downloaded models left on the system and cleans them up. This can be frustrating if your Internet connection can't complete a model download in one session. Setting the `OLLAMA_NOPRUNE` environment variable will prevent the server from pruning incomplete files.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
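
A hedged sketch of how the two environment variables in the FAQ above might be honored at server startup. The variable names come from the FAQ; the default path and lookup logic are assumptions for illustration, not ollama's exact code.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// OLLAMA_MODELS overrides where model layers are stored.
	// The ~/.ollama/models default here is an assumption for illustration.
	modelsDir := os.Getenv("OLLAMA_MODELS")
	if modelsDir == "" {
		if home, err := os.UserHomeDir(); err == nil {
			modelsDir = filepath.Join(home, ".ollama", "models")
		}
	}

	// OLLAMA_NOPRUNE, when set, skips cleanup of partial downloads.
	noprune := os.Getenv("OLLAMA_NOPRUNE") != ""

	fmt.Printf("models dir: %s, prune incomplete: %v\n", modelsDir, !noprune)
}
```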

View File

@@ -93,6 +93,8 @@ func (c *containerGGML) Name() string {
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -115,6 +117,10 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -141,6 +147,10 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
@@ -163,6 +173,10 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
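
The four hunks above add the same move: after reading whatever header or version a legacy container needs, the decoder seeks to the end of the file so the caller sees the whole file as consumed. A standalone sketch of the pattern; `readSeekOffset` here is a simplified stand-in, not the repo's exact type.

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

// readSeekOffset is a simplified stand-in for a reader whose tracked
// offset tells callers how much of the model file was consumed.
type readSeekOffset struct {
	io.ReadSeeker
	offset int64
}

func (r *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
	n, err := r.ReadSeeker.Seek(offset, whence)
	r.offset = n
	return n, err
}

func decodeLegacy(ro *readSeekOffset) error {
	// ... read version / hyperparameters here ...
	// remaining file contents aren't decoded, so jump to EOF: the caller
	// then sees the file as fully handled rather than truncated.
	_, err := ro.Seek(0, io.SeekEnd)
	return err
}

func main() {
	ro := &readSeekOffset{ReadSeeker: bytes.NewReader(make([]byte, 1024))}
	if err := decodeLegacy(ro); err != nil {
		fmt.Println("decode:", err)
	}
	fmt.Println("offset after decode:", ro.offset) // 1024
}
```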

View File

@@ -59,6 +59,7 @@ ws ::= ([ \t\n] ws)?
var llamaCppEmbed embed.FS
type ModelRunner struct {
Type string // "gguf" or "ggml"
Path string // path to the model runner executable
Accelerated bool
}
@@ -72,25 +73,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
switch runtime.GOOS {
case "darwin":
if runtime.GOARCH == "arm64" {
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
} else {
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
}
case "linux":
runners = []ModelRunner{
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
}
@@ -148,6 +149,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Type: r.Type,
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
@@ -402,11 +404,17 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
if runner.Type == "gguf" {
params = append(params, "--parallel", "2")
}
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
runner.Path,
append(params, "--port", strconv.Itoa(port))...,
params...,
)
var libraryPaths []string
@@ -537,7 +545,6 @@ type prediction struct {
const maxBufferSize = 512 * format.KiloByte
type PredictOpts struct {
Model string
Prompt string
Format string
CheckpointStart time.Time
@@ -545,7 +552,6 @@ type PredictOpts struct {
}
type PredictResult struct {
Model string
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
@@ -631,34 +637,35 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
continue
}
if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
if p.Content != "" {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
if p.Stop {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
if p.Content != "" {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
if p.Stop {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
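
The rewrite above flattens the happy-path nesting and makes a missing `data: ` prefix a hard error instead of a silently skipped line. A self-contained sketch of that parsing order, with the prediction struct trimmed to the two fields the sketch needs.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// prediction is trimmed down for illustration; the real struct has more fields.
type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

func parseEvent(line []byte) (*prediction, error) {
	// Reject lines without the SSE prefix up front, as the diff above does.
	evt, ok := bytes.CutPrefix(line, []byte("data: "))
	if !ok {
		return nil, fmt.Errorf("error parsing llm response stream: %s", line)
	}
	var p prediction
	if err := json.Unmarshal(evt, &p); err != nil {
		return nil, fmt.Errorf("error unmarshaling llm prediction response: %v", err)
	}
	return &p, nil
}

func main() {
	p, err := parseEvent([]byte(`data: {"content":"hello","stop":false}`))
	fmt.Println(p, err) // &{hello false} <nil>

	_, err = parseEvent([]byte(`bogus line`))
	fmt.Println(err) // error parsing llm response stream: bogus line
}
```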

View File

@@ -146,12 +146,16 @@ type ManifestV2 struct {
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
RootFS RootFS `json:"rootfs"`
api.ModelConfiguration
}
func (c *ConfigV2) SetModelFormat(format string) {

View File

@@ -199,10 +199,9 @@ func GenerateHandler(c *gin.Context) {
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(),
Model: req.Model,
ModelConfiguration: model.Config.ModelConfiguration,
Done: true})
CreatedAt: time.Now().UTC(),
Model: req.Model,
Done: true})
return
}
@@ -261,11 +260,10 @@ func GenerateHandler(c *gin.Context) {
}
resp := api.GenerateResponse{
Model: r.Model,
ModelConfiguration: model.Config.ModelConfiguration,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
@@ -290,7 +288,6 @@ func GenerateHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -302,19 +299,30 @@ func GenerateHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Wait for the channel to close
var r api.GenerateResponse
// Accumulate responses into the final response
var final api.GenerateResponse
var sb strings.Builder
for resp := range ch {
var ok bool
if r, ok = resp.(api.GenerateResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
switch r := resp.(type) {
case api.GenerateResponse:
sb.WriteString(r.Response)
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
return
}
sb.WriteString(r.Response)
}
r.Response = sb.String()
c.JSON(http.StatusOK, r)
final.Response = sb.String()
c.JSON(http.StatusOK, final)
return
}
@@ -852,7 +860,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf(err.Error())
log.Print(err.Error())
}
}
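
For context on 7db5bcf73b: staticcheck (check SA1006) flags `log.Printf` called with a non-constant format string and no arguments, because any `%` in the message is interpreted as a format verb. A minimal illustration, not code from the repo.

```go
package main

import "log"

func main() {
	msg := "disk usage at 95%" // a message that happens to contain '%'
	log.Printf(msg)            // SA1006: prints "disk usage at 95%!(NOVERB)"
	log.Print(msg)             // prints the message verbatim
}
```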
@@ -976,7 +984,7 @@ func ChatHandler(c *gin.Context) {
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
Model: r.Model,
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Metrics: api.Metrics{
@@ -998,7 +1006,6 @@ func ChatHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -1010,21 +1017,33 @@ func ChatHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Wait for the channel to close
var r api.ChatResponse
// Accumulate responses into the final response
var final api.ChatResponse
var sb strings.Builder
for resp := range ch {
var ok bool
if r, ok = resp.(api.ChatResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
switch r := resp.(type) {
case api.ChatResponse:
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
return
}
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
}
r.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, r)
final.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, final)
return
}
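
Both handlers now share the same accumulation shape for non-streaming requests: drain the channel, concatenate content, keep the last event's metadata, and surface errors instead of dropping them. A compact sketch of that shape, using hypothetical stand-in types rather than the api package.

```go
package main

import "fmt"

// event is a stand-in for api.GenerateResponse / api.ChatResponse.
type event struct {
	Content string
	Done    bool
}

// accumulate drains a channel of mixed results, concatenating content and
// keeping the last event's metadata; unexpected values become errors
// rather than being silently ignored, mirroring the handlers above.
func accumulate(ch <-chan any) (event, error) {
	var final event
	var sb []byte
	for resp := range ch {
		switch r := resp.(type) {
		case event:
			sb = append(sb, r.Content...)
			final = r
		case error:
			return event{}, r
		default:
			return event{}, fmt.Errorf("unexpected response type %T", resp)
		}
	}
	final.Content = string(sb)
	return final, nil
}

func main() {
	ch := make(chan any, 2)
	ch <- event{Content: "hel"}
	ch <- event{Content: "lo", Done: true}
	close(ch)
	out, err := accumulate(ch)
	fmt.Println(out, err) // {hello true} <nil>
}
```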