Compare commits


9 Commits

Author SHA1 Message Date
Jeffrey Morgan
7db5bcf73b fix go-staticcheck warning 2023-12-10 11:44:27 -05:00
Jeffrey Morgan
fa2f095bd9 fix model name returned by /api/generate being different than the model name provided 2023-12-10 11:42:15 -05:00
Jeffrey Morgan
045b855db9 fix error on accumulating final chat response 2023-12-10 11:24:39 -05:00
Jeffrey Morgan
32064a0646 fix empty response when receiving runner error 2023-12-10 10:53:38 -05:00
Jeffrey Morgan
d9a250e9b5 seek to end of file when decoding older model formats 2023-12-09 21:14:35 -05:00
Jeffrey Morgan
944519ed16 seek to eof for older model binaries 2023-12-09 20:48:57 -05:00
Jeffrey Morgan
2dd040d04c do not use --parallel 2 for old runners 2023-12-09 20:17:33 -05:00
Bruce MacDonald
bbe41ce41a fix: parallel queueing race condition caused silent failure (#1445)
* fix: queued request failures

- increase parallel requests to 2 so a queued request can complete; queueing is managed in ollama (see the sketch after the commit list)

* log stream errors
2023-12-09 14:14:02 -05:00
Jeffrey Morgan
9e1406e4ed Don't expose model information in /api/generate 2023-12-09 02:05:43 -08:00
7 changed files with 114 additions and 84 deletions
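
A minimal sketch of the queueing idea behind bbe41ce41a, as referenced in the commit message above. This is an illustration, not ollama's actual scheduler: giving the runner two parallel slots (`--parallel 2`) lets a request queued by the server start while the active one finishes, instead of failing silently.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	// Two-slot semaphore, mirroring the effect of --parallel 2 on the
	// runner: one request can be queued while another is in flight.
	sem := make(chan struct{}, 2)
	var wg sync.WaitGroup

	for i := 1; i <= 3; i++ {
		wg.Add(1)
		sem <- struct{}{} // acquire a slot; blocks only when both are busy
		go func(id int) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot when finished
			fmt.Printf("request %d running\n", id)
			time.Sleep(10 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}
```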

View File

@@ -203,22 +203,12 @@ type GenerateResponse struct {
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
ModelConfiguration ModelConfiguration `json:"model_configuration"`
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
type ModelConfiguration struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
}
func (m *Metrics) Summary() {
if m.TotalDuration > 0 {
fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)

View File

@@ -252,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 4,
"num_ctx": 1024,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
@@ -267,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}
}'
```

View File

@@ -95,10 +95,6 @@ The manifest lists all the layers used in this model. You will see a `media type
To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
### I downloaded most of a model yesterday, but it's gone today. What happened?
When the Ollama server starts, it looks for fragments of partially downloaded models left on the system and cleans them up. This can be frustrating if your Internet connection can't complete a model download in one session. Setting the `OLLAMA_NOPRUNE` environment variable will prevent the server from pruning incomplete files.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
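
A hedged sketch of how the two environment variables in the FAQ above might be honored at server startup. The variable names come from the FAQ; the default path and lookup logic are assumptions for illustration, not ollama's exact code.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// OLLAMA_MODELS overrides where model layers are stored.
	// The ~/.ollama/models default here is an assumption for illustration.
	modelsDir := os.Getenv("OLLAMA_MODELS")
	if modelsDir == "" {
		if home, err := os.UserHomeDir(); err == nil {
			modelsDir = filepath.Join(home, ".ollama", "models")
		}
	}

	// OLLAMA_NOPRUNE, when set, skips cleanup of partial downloads.
	noprune := os.Getenv("OLLAMA_NOPRUNE") != ""

	fmt.Printf("models dir: %s, prune incomplete: %v\n", modelsDir, !noprune)
}
```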

View File

@@ -93,6 +93,8 @@ func (c *containerGGML) Name() string {
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -115,6 +117,10 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -141,6 +147,10 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
@@ -163,6 +173,10 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
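
The four hunks above add the same move: after reading whatever header or version a legacy container needs, the decoder seeks to the end of the file so the caller sees the whole file as consumed. A standalone sketch of the pattern; `readSeekOffset` here is a simplified stand-in, not the repo's exact type.

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

// readSeekOffset is a simplified stand-in for a reader whose tracked
// offset tells callers how much of the model file was consumed.
type readSeekOffset struct {
	io.ReadSeeker
	offset int64
}

func (r *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
	n, err := r.ReadSeeker.Seek(offset, whence)
	r.offset = n
	return n, err
}

func decodeLegacy(ro *readSeekOffset) error {
	// ... read version / hyperparameters here ...
	// remaining file contents aren't decoded, so jump to EOF: the caller
	// then sees the file as fully handled rather than truncated.
	_, err := ro.Seek(0, io.SeekEnd)
	return err
}

func main() {
	ro := &readSeekOffset{ReadSeeker: bytes.NewReader(make([]byte, 1024))}
	if err := decodeLegacy(ro); err != nil {
		fmt.Println("decode:", err)
	}
	fmt.Println("offset after decode:", ro.offset) // 1024
}
```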

View File

@@ -59,6 +59,7 @@ ws ::= ([ \t\n] ws)?
var llamaCppEmbed embed.FS
type ModelRunner struct {
Type string // "gguf" or "ggml"
Path string // path to the model runner executable
Accelerated bool
}
@@ -72,25 +73,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
switch runtime.GOOS {
case "darwin":
if runtime.GOARCH == "arm64" {
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
} else {
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
}
case "linux":
runners = []ModelRunner{
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
}
@@ -148,6 +149,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Type: r.Type,
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
@@ -402,11 +404,17 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
if runner.Type == "gguf" {
params = append(params, "--parallel", "2")
}
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
runner.Path,
append(params, "--port", strconv.Itoa(port))...,
params...,
)
var libraryPaths []string
@@ -537,7 +545,6 @@ type prediction struct {
const maxBufferSize = 512 * format.KiloByte
type PredictOpts struct {
Model string
Prompt string
Format string
CheckpointStart time.Time
@@ -545,7 +552,6 @@ type PredictOpts struct {
}
type PredictResult struct {
Model string
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
@@ -631,34 +637,35 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
continue
}
if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
if p.Content != "" {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
if p.Stop {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
if p.Content != "" {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
if p.Stop {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
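
The rewrite above flattens the happy-path nesting and makes a missing `data: ` prefix a hard error instead of a silently skipped line. A self-contained sketch of that parsing order, with the prediction struct trimmed to the two fields the sketch needs.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// prediction is trimmed down for illustration; the real struct has more fields.
type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

func parseEvent(line []byte) (*prediction, error) {
	// Reject lines without the SSE prefix up front, as the diff above does.
	evt, ok := bytes.CutPrefix(line, []byte("data: "))
	if !ok {
		return nil, fmt.Errorf("error parsing llm response stream: %s", line)
	}
	var p prediction
	if err := json.Unmarshal(evt, &p); err != nil {
		return nil, fmt.Errorf("error unmarshaling llm prediction response: %v", err)
	}
	return &p, nil
}

func main() {
	p, err := parseEvent([]byte(`data: {"content":"hello","stop":false}`))
	fmt.Println(p, err) // &{hello false} <nil>

	_, err = parseEvent([]byte(`bogus line`))
	fmt.Println(err) // error parsing llm response stream: bogus line
}
```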

View File

@@ -146,12 +146,16 @@ type ManifestV2 struct {
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
RootFS RootFS `json:"rootfs"`
api.ModelConfiguration
}
func (c *ConfigV2) SetModelFormat(format string) {

View File

@@ -199,10 +199,9 @@ func GenerateHandler(c *gin.Context) {
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(),
Model: req.Model,
ModelConfiguration: model.Config.ModelConfiguration,
Done: true})
CreatedAt: time.Now().UTC(),
Model: req.Model,
Done: true})
return
}
@@ -261,11 +260,10 @@ func GenerateHandler(c *gin.Context) {
}
resp := api.GenerateResponse{
Model: r.Model,
ModelConfiguration: model.Config.ModelConfiguration,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
@@ -290,7 +288,6 @@ func GenerateHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -302,19 +299,30 @@ func GenerateHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Wait for the channel to close
var r api.GenerateResponse
// Accumulate responses into the final response
var final api.GenerateResponse
var sb strings.Builder
for resp := range ch {
var ok bool
if r, ok = resp.(api.GenerateResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
switch r := resp.(type) {
case api.GenerateResponse:
sb.WriteString(r.Response)
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
return
}
sb.WriteString(r.Response)
}
r.Response = sb.String()
c.JSON(http.StatusOK, r)
final.Response = sb.String()
c.JSON(http.StatusOK, final)
return
}
@@ -852,7 +860,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf(err.Error())
log.Print(err.Error())
}
}
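
For context on 7db5bcf73b: staticcheck (check SA1006) flags `log.Printf` called with a non-constant format string and no arguments, because any `%` in the message is interpreted as a format verb. A minimal illustration, not code from the repo.

```go
package main

import "log"

func main() {
	msg := "disk usage at 95%" // a message that happens to contain '%'
	log.Printf(msg)            // SA1006: prints "disk usage at 95%!(NOVERB)"
	log.Print(msg)             // prints the message verbatim
}
```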
@@ -976,7 +984,7 @@ func ChatHandler(c *gin.Context) {
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
Model: r.Model,
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Metrics: api.Metrics{
@@ -998,7 +1006,6 @@ func ChatHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -1010,21 +1017,33 @@ func ChatHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Wait for the channel to close
var r api.ChatResponse
// Accumulate responses into the final response
var final api.ChatResponse
var sb strings.Builder
for resp := range ch {
var ok bool
if r, ok = resp.(api.ChatResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
switch r := resp.(type) {
case api.ChatResponse:
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
return
}
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
}
r.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, r)
final.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, final)
return
}
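
Both handlers now share the same accumulation shape for non-streaming requests: drain the channel, concatenate content, keep the last event's metadata, and surface errors instead of dropping them. A compact sketch of that shape, using hypothetical stand-in types rather than the api package.

```go
package main

import "fmt"

// event is a stand-in for api.GenerateResponse / api.ChatResponse.
type event struct {
	Content string
	Done    bool
}

// accumulate drains a channel of mixed results, concatenating content and
// keeping the last event's metadata; unexpected values become errors
// rather than being silently ignored, mirroring the handlers above.
func accumulate(ch <-chan any) (event, error) {
	var final event
	var sb []byte
	for resp := range ch {
		switch r := resp.(type) {
		case event:
			sb = append(sb, r.Content...)
			final = r
		case error:
			return event{}, r
		default:
			return event{}, fmt.Errorf("unexpected response type %T", resp)
		}
	}
	final.Content = string(sb)
	return final, nil
}

func main() {
	ch := make(chan any, 2)
	ch <- event{Content: "hel"}
	ch <- event{Content: "lo", Done: true}
	close(ch)
	out, err := accumulate(ch)
	fmt.Println(out, err) // {hello true} <nil>
}
```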