Compare commits
9 Commits
timeout
...
bmizerany/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1963c00201 | ||
|
|
27402cb7a2 | ||
|
|
c1218199cf | ||
|
|
717f7229eb | ||
|
|
5f034f5b63 | ||
|
|
b910fa9010 | ||
|
|
6d4219083c | ||
|
|
1ed4f521c4 | ||
|
|
de2163dafd |
@@ -292,6 +292,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||||||
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
|
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
|
||||||
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
|
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
|
||||||
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
|
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
|
||||||
|
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
|
||||||
|
|
||||||
### Terminal
|
### Terminal
|
||||||
|
|
||||||
|
|||||||
36
cmd/cmd.go
36
cmd/cmd.go
@@ -624,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
|||||||
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
|
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
|
||||||
}
|
}
|
||||||
|
|
||||||
if flagsSet == 1 {
|
req := api.ShowRequest{Name: args[0]}
|
||||||
req := api.ShowRequest{Name: args[0]}
|
resp, err := client.Show(cmd.Context(), &req)
|
||||||
resp, err := client.Show(cmd.Context(), &req)
|
if err != nil {
|
||||||
if err != nil {
|
return err
|
||||||
return err
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
if flagsSet == 1 {
|
||||||
switch showType {
|
switch showType {
|
||||||
case "license":
|
case "license":
|
||||||
fmt.Println(resp.License)
|
fmt.Println(resp.License)
|
||||||
@@ -647,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
req := api.ShowRequest{Name: args[0]}
|
showInfo(resp)
|
||||||
resp, err := client.Show(cmd.Context(), &req)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func showInfo(resp *api.ShowResponse) {
|
||||||
arch := resp.ModelInfo["general.architecture"].(string)
|
arch := resp.ModelInfo["general.architecture"].(string)
|
||||||
|
|
||||||
modelData := [][]string{
|
modelData := [][]string{
|
||||||
@@ -672,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
|||||||
projectorData := [][]string{
|
projectorData := [][]string{
|
||||||
{"arch", "clip"},
|
{"arch", "clip"},
|
||||||
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
|
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
|
||||||
{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
|
|
||||||
{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
|
|
||||||
{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
|
||||||
|
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
|
||||||
|
}
|
||||||
|
|
||||||
|
projectorData = append(projectorData,
|
||||||
|
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
|
||||||
|
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
|
||||||
|
)
|
||||||
|
|
||||||
mainTableData = append(mainTableData,
|
mainTableData = append(mainTableData,
|
||||||
[]string{"Projector"},
|
[]string{"Projector"},
|
||||||
[]string{renderSubTable(projectorData, false)},
|
[]string{renderSubTable(projectorData, false)},
|
||||||
@@ -705,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
table.Render()
|
table.Render()
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderSubTable(data [][]string, file bool) string {
|
func renderSubTable(data [][]string, file bool) string {
|
||||||
|
|||||||
@@ -404,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
|||||||
|
|
||||||
switch args[1] {
|
switch args[1] {
|
||||||
case "info":
|
case "info":
|
||||||
fmt.Println("Model details:")
|
showInfo(resp)
|
||||||
if len(resp.Details.Families) > 0 {
|
|
||||||
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
|
|
||||||
} else if resp.Details.Family != "" {
|
|
||||||
fmt.Printf("Family %s\n", resp.Details.Family)
|
|
||||||
}
|
|
||||||
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
|
|
||||||
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
|
|
||||||
fmt.Println("")
|
|
||||||
case "license":
|
case "license":
|
||||||
if resp.License == "" {
|
if resp.License == "" {
|
||||||
fmt.Println("No license was specified for this model.")
|
fmt.Println("No license was specified for this model.")
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ All durations are returned in nanoseconds.
|
|||||||
|
|
||||||
### Streaming responses
|
### Streaming responses
|
||||||
|
|
||||||
Certain endpoints stream responses as JSON objects and can optional return non-streamed responses.
|
Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints.
|
||||||
|
|
||||||
## Generate a completion
|
## Generate a completion
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ Check your compute compatibility to see if your card is supported:
|
|||||||
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
|
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
|
||||||
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
|
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
|
||||||
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
|
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
|
||||||
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` |
|
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` |
|
||||||
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
|
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
|
||||||
| | Tesla | `P40` `P4` |
|
| | Tesla | `P40` `P4` |
|
||||||
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |
|
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
|
|||||||
|
|
||||||
#### Notes
|
#### Notes
|
||||||
|
|
||||||
- `finish_reason` will always be `stop`
|
|
||||||
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
|
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
|
||||||
|
|
||||||
## Models
|
## Models
|
||||||
|
|||||||
46
llm/ext_server/server.cpp
vendored
46
llm/ext_server/server.cpp
vendored
@@ -1650,26 +1650,41 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||||
|
|
||||||
|
char buf[256];
|
||||||
|
llama_model_meta_val_str(model, "general.architecture", buf, 256);
|
||||||
|
bool gemma2 = strcmp(buf, "gemma2") == 0;
|
||||||
|
|
||||||
|
int32_t truncate_at = slot.n_ctx;
|
||||||
|
|
||||||
|
// truncate at 2/3 of the context length for gemma2 models
|
||||||
|
// as they do not support context shifts (from the sliding window implementation).
|
||||||
|
// this way, prompts that almost fit the context length can still generate a full
|
||||||
|
// response without a sudden stop from hitting the context limit
|
||||||
|
if (gemma2) {
|
||||||
|
truncate_at = 2 * slot.n_ctx / 3;
|
||||||
|
}
|
||||||
|
|
||||||
// if input prompt is too big, truncate it, if group attention self-extend is disabled
|
// if input prompt is too big, truncate it, if group attention self-extend is disabled
|
||||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
|
if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
|
||||||
{
|
{
|
||||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||||
const int n_block_size = n_left / 2;
|
const int n_shift = n_left / 2;
|
||||||
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
|
||||||
|
|
||||||
std::vector<llama_token> new_tokens(
|
std::vector<llama_token> new_tokens(
|
||||||
prompt_tokens.begin(),
|
prompt_tokens.begin(),
|
||||||
prompt_tokens.begin() + slot.params.n_keep);
|
prompt_tokens.begin() + slot.params.n_keep);
|
||||||
new_tokens.insert(
|
new_tokens.insert(
|
||||||
new_tokens.end(),
|
new_tokens.end(),
|
||||||
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
prompt_tokens.begin() + slot.params.n_keep + n_erase,
|
||||||
prompt_tokens.end());
|
prompt_tokens.end());
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_INFO("input truncated", {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
{"n_keep", slot.params.n_keep},
|
{"n_keep", slot.params.n_keep},
|
||||||
{"n_left", n_left},
|
{"n_left", n_left},
|
||||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
{"n_shift", n_shift},
|
||||||
|
{"n_erase", n_erase},
|
||||||
});
|
});
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
prompt_tokens = new_tokens;
|
prompt_tokens = new_tokens;
|
||||||
@@ -1678,6 +1693,19 @@ struct llama_server_context
|
|||||||
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Models with sliding window attention do not work with context shifts, so
|
||||||
|
// limit their prediction to the context length
|
||||||
|
if (gemma2) {
|
||||||
|
int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
|
||||||
|
slot.n_predict = limit;
|
||||||
|
slot.params.n_predict = limit;
|
||||||
|
LOG_INFO("model does not support sliding window, limiting generation", {
|
||||||
|
{"n_ctx", slot.n_ctx},
|
||||||
|
{"n_prompt_tokens", slot.n_prompt_tokens},
|
||||||
|
{"n_predict", slot.n_predict}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (!slot.params.cache_prompt)
|
if (!slot.params.cache_prompt)
|
||||||
{
|
{
|
||||||
llama_sampling_reset(slot.ctx_sampling);
|
llama_sampling_reset(slot.ctx_sampling);
|
||||||
|
|||||||
15
llm/ggml.go
15
llm/ggml.go
@@ -366,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
|||||||
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
case "gemma":
|
case "gemma", "gemma2":
|
||||||
fullOffload = 4 * batch * (embedding + vocab)
|
fullOffload = max(
|
||||||
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
|
4*batch*(embedding+vocab),
|
||||||
|
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
|
||||||
|
)
|
||||||
|
|
||||||
|
partialOffload = max(
|
||||||
|
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
|
||||||
|
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
|
||||||
|
4*embeddingHeadsK*context*8+
|
||||||
|
embedding*embeddingHeadsK*heads*9/16,
|
||||||
|
)
|
||||||
case "command-r":
|
case "command-r":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
4*batch*(embedding+vocab),
|
4*batch*(embedding+vocab),
|
||||||
|
|||||||
@@ -166,8 +166,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||||||
|
|
||||||
params = append(params, "--log-disable")
|
params = append(params, "--log-disable")
|
||||||
|
|
||||||
params = append(params, "--timeout", fmt.Sprintf("%d", 600))
|
|
||||||
|
|
||||||
if opts.NumGPU >= 0 {
|
if opts.NumGPU >= 0 {
|
||||||
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
|
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user