Compare commits


1 Commit

Author: Matt Williams
SHA1: 0d4fa34aee
Message: Added mention of the NOPRUNE env var
Signed-off-by: Matt Williams <m@technovangelist.com>
Date: 2023-12-08 17:37:12 -08:00
7 changed files with 84 additions and 114 deletions


@@ -203,12 +203,22 @@ type GenerateResponse struct {
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
ModelConfiguration ModelConfiguration `json:"model_configuration"`
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
type ModelConfiguration struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
}
func (m *Metrics) Summary() {
if m.TotalDuration > 0 {
fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)


@@ -252,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 1024,
"num_ctx": 4,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
@@ -267,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}
}'
```


@@ -95,6 +95,10 @@ The manifest lists all the layers used in this model. You will see a `media type
To change where models are stored, set the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in service file under `/etc/systemd/system/ollama.service.d`, reloading systemd, and restarting the ollama service.
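
A minimal sketch of such a drop-in, assuming a hypothetical drop-in file name (`environment.conf`) and models directory (`/data/ollama/models`):

```shell
# Hypothetical drop-in file name and models path; adjust both to your system
sudo mkdir -p /etc/systemd/system/ollama.service.d
cat <<'EOF' | sudo tee /etc/systemd/system/ollama.service.d/environment.conf
[Service]
Environment="OLLAMA_MODELS=/data/ollama/models"
EOF

# Pick up the drop-in and restart the service
sudo systemctl daemon-reload
sudo systemctl restart ollama
```
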
### I downloaded most of a model yesterday, but it's gone today. What happened?
When the Ollama server starts, it looks for fragments of models that remain on the system and cleans them out. If your Internet connection can't complete a model download in one session, this can be frustrating. Setting the `OLLAMA_NOPRUNE` environment variable prevents the server from pruning incomplete files.
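
As a sketch of using it (assuming any non-empty value enables the behavior), the variable can be set for a single run of the server:

```shell
# Keep partially downloaded model blobs across restarts
# (assumes any non-empty value enables OLLAMA_NOPRUNE)
OLLAMA_NOPRUNE=1 ollama serve
```
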
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generating a response from a model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.


@@ -93,8 +93,6 @@ func (c *containerGGML) Name() string {
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -117,10 +115,6 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -147,10 +141,6 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
@@ -173,10 +163,6 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}


@@ -59,7 +59,6 @@ ws ::= ([ \t\n] ws)?
var llamaCppEmbed embed.FS
type ModelRunner struct {
Type string // "gguf" or "ggml"
Path string // path to the model runner executable
Accelerated bool
}
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
switch runtime.GOOS {
case "darwin":
if runtime.GOARCH == "arm64" {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
} else {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
}
case "linux":
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
}
@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Type: r.Type,
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
@@ -404,17 +402,11 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
if runner.Type == "gguf" {
params = append(params, "--parallel", "2")
}
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
runner.Path,
params...,
append(params, "--port", strconv.Itoa(port))...,
)
var libraryPaths []string
@@ -545,6 +537,7 @@ type prediction struct {
const maxBufferSize = 512 * format.KiloByte
type PredictOpts struct {
Model string
Prompt string
Format string
CheckpointStart time.Time
@@ -552,6 +545,7 @@ type PredictOpts struct {
}
type PredictResult struct {
Model string
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
@@ -637,35 +631,34 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
continue
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
if p.Content != "" {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
if p.Content != "" {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
if p.Stop {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
if p.Stop {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
}


@@ -146,16 +146,12 @@ type ManifestV2 struct {
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
RootFS RootFS `json:"rootfs"`
api.ModelConfiguration
}
func (c *ConfigV2) SetModelFormat(format string) {


@@ -199,9 +199,10 @@ func GenerateHandler(c *gin.Context) {
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(),
Model: req.Model,
Done: true})
CreatedAt: time.Now().UTC(),
Model: req.Model,
ModelConfiguration: model.Config.ModelConfiguration,
Done: true})
return
}
@@ -260,10 +261,11 @@ func GenerateHandler(c *gin.Context) {
}
resp := api.GenerateResponse{
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Model: r.Model,
ModelConfiguration: model.Config.ModelConfiguration,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
@@ -288,6 +290,7 @@ func GenerateHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -299,30 +302,19 @@ func GenerateHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.GenerateResponse
// Wait for the channel to close
var r api.GenerateResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.GenerateResponse:
sb.WriteString(r.Response)
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.GenerateResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
sb.WriteString(r.Response)
}
final.Response = sb.String()
c.JSON(http.StatusOK, final)
r.Response = sb.String()
c.JSON(http.StatusOK, r)
return
}
@@ -860,7 +852,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Print(err.Error())
log.Printf(err.Error())
}
}
@@ -984,7 +976,7 @@ func ChatHandler(c *gin.Context) {
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
Model: req.Model,
Model: r.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Metrics: api.Metrics{
@@ -1006,6 +998,7 @@ func ChatHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -1017,33 +1010,21 @@ func ChatHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.ChatResponse
// Wait for the channel to close
var r api.ChatResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.ChatResponse:
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.ChatResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
}
final.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, final)
r.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, r)
return
}