Compare commits
mattw/nopr ... v0.1.14
9 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 7db5bcf73b | |
| | fa2f095bd9 | |
| | 045b855db9 | |
| | 32064a0646 | |
| | d9a250e9b5 | |
| | 944519ed16 | |
| | 2dd040d04c | |
| | bbe41ce41a | |
| | 9e1406e4ed | |
api/types.go (10 changes)
@@ -203,22 +203,12 @@ type GenerateResponse struct {
 	CreatedAt time.Time `json:"created_at"`
 	Response  string    `json:"response"`
 
-	ModelConfiguration ModelConfiguration `json:"model_configuration"`
-
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`
 
 	Metrics
 }
 
-type ModelConfiguration struct {
-	ModelFormat   string   `json:"model_format"`
-	ModelFamily   string   `json:"model_family"`
-	ModelFamilies []string `json:"model_families"`
-	ModelType     string   `json:"model_type"`
-	FileType      string   `json:"file_type"`
-}
-
 func (m *Metrics) Summary() {
 	if m.TotalDuration > 0 {
 		fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)
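One detail of the resulting `GenerateResponse` is worth calling out: `Metrics` is an embedded field, so its fields and its `Summary()` method are promoted onto the response itself. A minimal sketch of that promotion, with both structs trimmed to the fields used here:

```go
package main

import (
	"fmt"
	"os"
	"time"
)

// Metrics mirrors the embedded struct from api/types.go, trimmed to one field.
type Metrics struct {
	TotalDuration time.Duration `json:"total_duration,omitempty"`
}

// Summary prints non-zero metrics to stderr, as in the diff above.
func (m *Metrics) Summary() {
	if m.TotalDuration > 0 {
		fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)
	}
}

// GenerateResponse embeds Metrics rather than naming it, so callers can
// write resp.TotalDuration and resp.Summary() directly.
type GenerateResponse struct {
	Response string `json:"response"`
	Done     bool   `json:"done"`
	Metrics
}

func main() {
	resp := GenerateResponse{Response: "hi", Done: true}
	resp.TotalDuration = 2 * time.Second // promoted field
	resp.Summary()                       // promoted method
}
```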
docs/api.md

@@ -252,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
     "penalize_newline": true,
     "stop": ["\n", "user:"],
     "numa": false,
-    "num_ctx": 4,
+    "num_ctx": 1024,
     "num_batch": 2,
     "num_gqa": 1,
     "num_gpu": 1,
@@ -267,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
     "rope_frequency_base": 1.1,
     "rope_frequency_scale": 0.8,
     "num_thread": 8
   }
 }
 }'
 ```
docs/faq.md

@@ -95,10 +95,6 @@ The manifest lists all the layers used in this model. You will see a `media type
 
 To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
 
-### I downloaded most of a model yesterday, but it's gone today. What happened?
-
-When the Ollama server starts, it looks for fragments of models that still exist on the system and cleans them out. If you have an Internet connection that can't complete a model download all at once, this can be frustrating. Adding the OLLAMA_NOPRUNE environment variable will prevent the server from pruning incomplete files.
-
 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
 
 No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
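The pruning behavior this FAQ entry describes is toggled purely by the presence of an environment variable. A hypothetical sketch of how a server might gate its startup cleanup pass on `OLLAMA_NOPRUNE` (the `pruneIncompleteDownloads` helper is illustrative, not Ollama's actual function):

```go
package main

import (
	"log"
	"os"
)

// pruneIncompleteDownloads stands in for the cleanup pass described above;
// the real logic lives in the Ollama server.
func pruneIncompleteDownloads() {
	log.Println("pruning incomplete model downloads")
}

func main() {
	// If OLLAMA_NOPRUNE is set, skip pruning partial model blobs at startup
	// so an interrupted download can resume later.
	if _, ok := os.LookupEnv("OLLAMA_NOPRUNE"); !ok {
		pruneIncompleteDownloads()
	}
}
```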
llm/ggml.go (14 changes)
@@ -93,6 +93,8 @@ func (c *containerGGML) Name() string {
 }
 
 func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
+	// file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
 	return nil, nil
 }
 
@@ -115,6 +117,10 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
 	}
 
 	c.version = version
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return nil, nil
 }
 
@@ -141,6 +147,10 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
 	// different model types may have different layouts for hyperparameters
 	var llama llamaModel
 	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return &llama, nil
 }
 
@@ -163,6 +173,10 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
 	}
 
 	c.version = version
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return nil, nil
 }
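All four `Decode` implementations now end the same way: after reading whatever header they need, they seek to the end of the reader, so the caller observes a fully consumed container without paying to decode the tensor payload. A standalone sketch of the same pattern (the 4-byte version header is an assumption for illustration):

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

// readHeader decodes only a fixed-size header, then seeks to EOF so the
// remaining payload is skipped instead of being read into memory.
func readHeader(rs io.ReadSeeker) (uint32, error) {
	var version uint32
	if err := binary.Read(rs, binary.LittleEndian, &version); err != nil {
		return 0, err
	}

	// remaining file contents aren't decoded
	if _, err := rs.Seek(0, io.SeekEnd); err != nil {
		return 0, err
	}
	return version, nil
}

func main() {
	// 4-byte little-endian version followed by an undecoded payload
	buf := bytes.NewReader([]byte{1, 0, 0, 0, 0xde, 0xad, 0xbe, 0xef})
	v, err := readHeader(buf)
	if err != nil {
		panic(err)
	}
	fmt.Println("version:", v) // version: 1
}
```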
llm/llama.go (77 changes)
@@ -59,6 +59,7 @@ ws ::= ([ \t\n] ws)?
 var llamaCppEmbed embed.FS
 
 type ModelRunner struct {
+	Type        string // "gguf" or "ggml"
 	Path        string // path to the model runner executable
 	Accelerated bool
 }
@@ -72,25 +73,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	switch runtime.GOOS {
 	case "darwin":
 		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
 		} else {
-			runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
 		}
 	case "linux":
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
+			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
 		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	}
 
@@ -148,6 +149,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	for _, r := range runners {
 		// clean the ModelRunner paths so that they match the OS we are running on
 		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
+			Type:        r.Type,
 			Path:        filepath.Clean(path.Join(workDir, r.Path)),
 			Accelerated: r.Accelerated,
 		})
@@ -402,11 +404,17 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 	}
 
 	port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+	params := append(params, "--port", strconv.Itoa(port))
+
+	if runner.Type == "gguf" {
+		params = append(params, "--parallel", "2")
+	}
+
 	ctx, cancel := context.WithCancel(context.Background())
 	cmd := exec.CommandContext(
 		ctx,
 		runner.Path,
-		append(params, "--port", strconv.Itoa(port))...,
+		params...,
 	)
 
 	var libraryPaths []string
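Extracting the port flag into `params` lets the gguf-specific `--parallel` flag join the same slice before the command is built. Note that `params :=` sits inside the per-runner retry loop, so each attempt seemingly starts from the base parameter list rather than accumulating flags across retries. A reduced sketch of that flow (the model path and runner types are placeholders):

```go
package main

import (
	"fmt"
	"math/rand"
	"strconv"
)

func main() {
	base := []string{"--model", "/path/to/model"} // stand-in for the params built earlier

	for _, runnerType := range []string{"ggml", "gguf"} {
		// get a random port in the ephemeral range 49152-65535
		port := rand.Intn(65535-49152) + 49152

		// appending to base rather than params keeps each attempt independent
		params := append(base, "--port", strconv.Itoa(port))

		// only gguf runners understand the --parallel flag
		if runnerType == "gguf" {
			params = append(params, "--parallel", "2")
		}

		// the real code hands params to exec.CommandContext(ctx, runner.Path, params...)
		fmt.Println(runnerType, params)
	}
}
```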
@@ -537,7 +545,6 @@ type prediction struct {
 const maxBufferSize = 512 * format.KiloByte
 
 type PredictOpts struct {
-	Model           string
 	Prompt          string
 	Format          string
 	CheckpointStart time.Time
@@ -545,7 +552,6 @@ type PredictOpts struct {
 }
 
 type PredictResult struct {
-	Model         string
 	CreatedAt     time.Time
 	TotalDuration time.Duration
 	LoadDuration  time.Duration
@@ -631,34 +637,35 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
 			continue
 		}
 
-		if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
-			var p prediction
-			if err := json.Unmarshal(evt, &p); err != nil {
-				return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-			}
-
-			if p.Content != "" {
-				fn(PredictResult{
-					Model:     predict.Model,
-					CreatedAt: time.Now().UTC(),
-					Content:   p.Content,
-				})
-			}
-
-			if p.Stop {
-				fn(PredictResult{
-					Model:         predict.Model,
-					CreatedAt:     time.Now().UTC(),
-					TotalDuration: time.Since(predict.CheckpointStart),
-
-					Done:                true,
-					PromptEvalCount:     p.Timings.PromptN,
-					PromptEvalDuration:  parseDurationMs(p.Timings.PromptMS),
-					EvalCount:           p.Timings.PredictedN,
-					EvalDuration:        parseDurationMs(p.Timings.PredictedMS),
-				})
-				return nil
-			}
-		}
+		evt, ok := bytes.CutPrefix(line, []byte("data: "))
+		if !ok {
+			return fmt.Errorf("error parsing llm response stream: %s", line)
+		}
+
+		var p prediction
+		if err := json.Unmarshal(evt, &p); err != nil {
+			return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
+		}
+
+		if p.Content != "" {
+			fn(PredictResult{
+				CreatedAt: time.Now().UTC(),
+				Content:   p.Content,
+			})
+		}
+
+		if p.Stop {
+			fn(PredictResult{
+				CreatedAt:     time.Now().UTC(),
+				TotalDuration: time.Since(predict.CheckpointStart),
+
+				Done:               true,
+				PromptEvalCount:    p.Timings.PromptN,
+				PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+				EvalCount:          p.Timings.PredictedN,
+				EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+			})
+			return nil
+		}
 	}
 }
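The rewritten loop also changes error handling: a line without the `data: ` prefix used to be silently ignored and is now a hard error. A self-contained sketch of the prefix-check-then-unmarshal shape using `bytes.CutPrefix` (Go 1.20+), with the payload type trimmed to two fields:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// prediction is a reduced stand-in for the server's streaming payload.
type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

// parseLine rejects anything that isn't an SSE data line, then decodes it.
func parseLine(line []byte) (*prediction, error) {
	evt, ok := bytes.CutPrefix(line, []byte("data: "))
	if !ok {
		return nil, fmt.Errorf("error parsing llm response stream: %s", line)
	}

	var p prediction
	if err := json.Unmarshal(evt, &p); err != nil {
		return nil, fmt.Errorf("error unmarshaling llm prediction response: %v", err)
	}
	return &p, nil
}

func main() {
	p, err := parseLine([]byte(`data: {"content":"hello","stop":false}`))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", *p) // {Content:hello Stop:false}
}
```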
server/images.go

@@ -146,12 +146,16 @@ type ManifestV2 struct {
 }
 
 type ConfigV2 struct {
+	ModelFormat   string   `json:"model_format"`
+	ModelFamily   string   `json:"model_family"`
+	ModelFamilies []string `json:"model_families"`
+	ModelType     string   `json:"model_type"`
+	FileType      string   `json:"file_type"`
+
 	// required by spec
 	Architecture string `json:"architecture"`
 	OS           string `json:"os"`
 	RootFS       RootFS `json:"rootfs"`
-
-	api.ModelConfiguration
 }
 
 func (c *ConfigV2) SetModelFormat(format string) {
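The on-the-wire JSON does not change here: an embedded struct without a field name and tag (like `api.ModelConfiguration` on the removed side) has its fields promoted and marshaled inline, so spelling the five fields out produces the same flat object. A quick demonstration with trimmed-down stand-ins for the real types:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// ModelConfiguration stands in for api.ModelConfiguration, trimmed to two fields.
type ModelConfiguration struct {
	ModelFormat string `json:"model_format"`
	ModelFamily string `json:"model_family"`
}

// ConfigV2 embeds ModelConfiguration; the embedded fields marshal inline.
type ConfigV2 struct {
	Architecture string `json:"architecture"`
	ModelConfiguration
}

func main() {
	b, _ := json.Marshal(ConfigV2{
		Architecture:       "amd64",
		ModelConfiguration: ModelConfiguration{ModelFormat: "gguf", ModelFamily: "llama"},
	})
	fmt.Println(string(b))
	// {"architecture":"amd64","model_format":"gguf","model_family":"llama"}
}
```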
server/routes.go

@@ -199,10 +199,9 @@ func GenerateHandler(c *gin.Context) {
 	// an empty request loads the model
 	if req.Prompt == "" && req.Template == "" && req.System == "" {
 		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt:          time.Now().UTC(),
-			Model:              req.Model,
-			ModelConfiguration: model.Config.ModelConfiguration,
-			Done:               true})
+			CreatedAt: time.Now().UTC(),
+			Model:     req.Model,
+			Done:      true})
 		return
 	}
 
@@ -261,11 +260,10 @@ func GenerateHandler(c *gin.Context) {
 	}
 
 	resp := api.GenerateResponse{
-		Model:              r.Model,
-		ModelConfiguration: model.Config.ModelConfiguration,
-		CreatedAt:          r.CreatedAt,
-		Done:               r.Done,
-		Response:           r.Content,
+		Model:     req.Model,
+		CreatedAt: r.CreatedAt,
+		Done:      r.Done,
+		Response:  r.Content,
 		Metrics: api.Metrics{
 			TotalDuration: r.TotalDuration,
 			LoadDuration:  r.LoadDuration,
@@ -290,7 +288,6 @@ func GenerateHandler(c *gin.Context) {
 
 	// Start prediction
 	predictReq := llm.PredictOpts{
-		Model:           model.Name,
 		Prompt:          prompt,
 		Format:          req.Format,
 		CheckpointStart: checkpointStart,
@@ -302,19 +299,30 @@ func GenerateHandler(c *gin.Context) {
 	}()
 
 	if req.Stream != nil && !*req.Stream {
-		// Wait for the channel to close
-		var r api.GenerateResponse
+		// Accumulate responses into the final response
+		var final api.GenerateResponse
 		var sb strings.Builder
 		for resp := range ch {
-			var ok bool
-			if r, ok = resp.(api.GenerateResponse); !ok {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			switch r := resp.(type) {
+			case api.GenerateResponse:
+				sb.WriteString(r.Response)
+				final = r
+			case gin.H:
+				if errorMsg, ok := r["error"].(string); ok {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+					return
+				} else {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
+					return
+				}
+			default:
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
 				return
 			}
-			sb.WriteString(r.Response)
 		}
-		r.Response = sb.String()
-		c.JSON(http.StatusOK, r)
+
+		final.Response = sb.String()
+		c.JSON(http.StatusOK, final)
 		return
 	}
 
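For non-streaming clients, the handler now drains the channel with a type switch: partial `GenerateResponse` values are concatenated into one string while the last value is kept for its metadata, and `gin.H` values surface as errors. A self-contained sketch of that accumulation pattern (types reduced to the fields involved):

```go
package main

import (
	"fmt"
	"strings"
)

// GenerateResponse is a reduced stand-in for api.GenerateResponse.
type GenerateResponse struct {
	Response string
	Done     bool
}

func main() {
	// Simulate a closed channel of streamed partial responses.
	ch := make(chan any, 2)
	ch <- GenerateResponse{Response: "Hello, "}
	ch <- GenerateResponse{Response: "world!", Done: true}
	close(ch)

	// Accumulate responses into the final response
	var final GenerateResponse
	var sb strings.Builder
	for resp := range ch {
		switch r := resp.(type) {
		case GenerateResponse:
			sb.WriteString(r.Response)
			final = r // keep the last value for its metadata (Done, metrics)
		default:
			fmt.Println("unexpected error")
			return
		}
	}

	final.Response = sb.String()
	fmt.Printf("%+v\n", final) // {Response:Hello, world! Done:true}
}
```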
@@ -852,7 +860,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	if runtime.GOOS == "linux" {
 		// check compatibility to log warnings
 		if _, err := llm.CheckVRAM(); err != nil {
-			log.Printf(err.Error())
+			log.Print(err.Error())
 		}
 	}
 
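The `log.Printf(err.Error())` to `log.Print(err.Error())` change fixes a classic format-string bug: if the error text happens to contain a `%` verb, `Printf` mangles it (and `go vet` flags the non-constant format string). A minimal reproduction:

```go
package main

import (
	"errors"
	"log"
)

func main() {
	err := errors.New("invalid option: %s")

	// Printf treats the message as a format string:
	// logs "invalid option: %!s(MISSING)"
	log.Printf(err.Error())

	// Print logs the message verbatim: "invalid option: %s"
	log.Print(err.Error())
}
```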
@@ -976,7 +984,7 @@ func ChatHandler(c *gin.Context) {
 	loaded.expireTimer.Reset(sessionDuration)
 
 	resp := api.ChatResponse{
-		Model:     r.Model,
+		Model:     req.Model,
 		CreatedAt: r.CreatedAt,
 		Done:      r.Done,
 		Metrics: api.Metrics{
@@ -998,7 +1006,6 @@ func ChatHandler(c *gin.Context) {
 
 	// Start prediction
 	predictReq := llm.PredictOpts{
-		Model:           model.Name,
 		Prompt:          prompt,
 		Format:          req.Format,
 		CheckpointStart: checkpointStart,
@@ -1010,21 +1017,33 @@ func ChatHandler(c *gin.Context) {
 	}()
 
 	if req.Stream != nil && !*req.Stream {
-		// Wait for the channel to close
-		var r api.ChatResponse
+		// Accumulate responses into the final response
+		var final api.ChatResponse
 		var sb strings.Builder
 		for resp := range ch {
-			var ok bool
-			if r, ok = resp.(api.ChatResponse); !ok {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			switch r := resp.(type) {
+			case api.ChatResponse:
+				if r.Message != nil {
+					sb.WriteString(r.Message.Content)
+				}
+
+				final = r
+			case gin.H:
+				if errorMsg, ok := r["error"].(string); ok {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+					return
+				} else {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
+					return
+				}
+			default:
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
 				return
 			}
-			if r.Message != nil {
-				sb.WriteString(r.Message.Content)
-			}
 		}
-		r.Message = &api.Message{Role: "assistant", Content: sb.String()}
-		c.JSON(http.StatusOK, r)
+
+		final.Message = &api.Message{Role: "assistant", Content: sb.String()}
+		c.JSON(http.StatusOK, final)
 		return
 	}
 