Compare commits


1 Commit

Author: Matt Williams
SHA1: 0d4fa34aee
Message: Added mention of the NOPRUNE env var
Signed-off-by: Matt Williams <m@technovangelist.com>
Date: 2023-12-08 17:37:12 -08:00
7 changed files with 84 additions and 114 deletions


@@ -203,12 +203,22 @@ type GenerateResponse struct {
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
ModelConfiguration ModelConfiguration `json:"model_configuration"`
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
type ModelConfiguration struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
}
func (m *Metrics) Summary() {
if m.TotalDuration > 0 {
fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)


@@ -252,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 1024,
"num_ctx": 4,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
@@ -267,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}
}'
```


@@ -95,6 +95,10 @@ The manifest lists all the layers used in this model. You will see a `media type
To change where models are stored, set the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in service file under `/etc/systemd/system/ollama.service.d`, reloading systemd, and restarting the ollama service.
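
A minimal sketch of such a drop-in, assuming a hypothetical drop-in file name (`environment.conf`) and models directory (`/data/ollama/models`):

```shell
# Hypothetical drop-in file name and models path; adjust both to your system
sudo mkdir -p /etc/systemd/system/ollama.service.d
cat <<'EOF' | sudo tee /etc/systemd/system/ollama.service.d/environment.conf
[Service]
Environment="OLLAMA_MODELS=/data/ollama/models"
EOF

# Pick up the drop-in and restart the service
sudo systemctl daemon-reload
sudo systemctl restart ollama
```
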
### I downloaded most of a model yesterday, but it's gone today. What happened?
When the Ollama server starts, it looks for fragments of models that remain on the system and cleans them out. If your Internet connection can't complete a model download in one session, this can be frustrating. Setting the `OLLAMA_NOPRUNE` environment variable prevents the server from pruning incomplete files.
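
As a sketch of using it (assuming any non-empty value enables the behavior), the variable can be set for a single run of the server:

```shell
# Keep partially downloaded model blobs across restarts
# (assumes any non-empty value enables OLLAMA_NOPRUNE)
OLLAMA_NOPRUNE=1 ollama serve
```
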
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generating a response from a model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.


@@ -93,8 +93,6 @@ func (c *containerGGML) Name() string {
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -117,10 +115,6 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -147,10 +141,6 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
@@ -173,10 +163,6 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}


@@ -59,7 +59,6 @@ ws ::= ([ \t\n] ws)?
var llamaCppEmbed embed.FS
type ModelRunner struct {
Type string // "gguf" or "ggml"
Path string // path to the model runner executable
Accelerated bool
}
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
switch runtime.GOOS {
case "darwin":
if runtime.GOARCH == "arm64" {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
} else {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
}
case "linux":
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
}
@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Type: r.Type,
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
@@ -404,17 +402,11 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
if runner.Type == "gguf" {
params = append(params, "--parallel", "2")
}
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
runner.Path,
params...,
append(params, "--port", strconv.Itoa(port))...,
)
var libraryPaths []string
@@ -545,6 +537,7 @@ type prediction struct {
const maxBufferSize = 512 * format.KiloByte
type PredictOpts struct {
Model string
Prompt string
Format string
CheckpointStart time.Time
@@ -552,6 +545,7 @@ type PredictOpts struct {
}
type PredictResult struct {
Model string
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
@@ -637,35 +631,34 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
continue
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
if p.Content != "" {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
if p.Content != "" {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
}
if p.Stop {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
if p.Stop {
fn(PredictResult{
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
}


@@ -146,16 +146,12 @@ type ManifestV2 struct {
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
RootFS RootFS `json:"rootfs"`
api.ModelConfiguration
}
func (c *ConfigV2) SetModelFormat(format string) {


@@ -199,9 +199,10 @@ func GenerateHandler(c *gin.Context) {
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(),
Model: req.Model,
Done: true})
CreatedAt: time.Now().UTC(),
Model: req.Model,
ModelConfiguration: model.Config.ModelConfiguration,
Done: true})
return
}
@@ -260,10 +261,11 @@ func GenerateHandler(c *gin.Context) {
}
resp := api.GenerateResponse{
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Model: r.Model,
ModelConfiguration: model.Config.ModelConfiguration,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
@@ -288,6 +290,7 @@ func GenerateHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -299,30 +302,19 @@ func GenerateHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.GenerateResponse
// Wait for the channel to close
var r api.GenerateResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.GenerateResponse:
sb.WriteString(r.Response)
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.GenerateResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
sb.WriteString(r.Response)
}
final.Response = sb.String()
c.JSON(http.StatusOK, final)
r.Response = sb.String()
c.JSON(http.StatusOK, r)
return
}
@@ -860,7 +852,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Print(err.Error())
log.Printf(err.Error())
}
}
@@ -984,7 +976,7 @@ func ChatHandler(c *gin.Context) {
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
Model: req.Model,
Model: r.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Metrics: api.Metrics{
@@ -1006,6 +998,7 @@ func ChatHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
@@ -1017,33 +1010,21 @@ func ChatHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.ChatResponse
// Wait for the channel to close
var r api.ChatResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.ChatResponse:
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.ChatResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
}
final.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, final)
r.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, r)
return
}