Compare commits
4 Commits
v0.3.1
...
royh/strea
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f16b3db70c | ||
|
|
23ff673bdc | ||
|
|
7950053972 | ||
|
|
d2b25c1bfb |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
|
||||
### Multimodal models
|
||||
|
||||
```
|
||||
ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
|
||||
>>> What's in this image? /Users/jmorgan/Desktop/smile.png
|
||||
The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||
```
|
||||
|
||||
|
||||
@@ -267,10 +267,6 @@ type EmbedRequest struct {
|
||||
type EmbedResponse struct {
|
||||
Model string `json:"model"`
|
||||
Embeddings [][]float32 `json:"embeddings"`
|
||||
|
||||
TotalDuration time.Duration `json:"total_duration,omitempty"`
|
||||
LoadDuration time.Duration `json:"load_duration,omitempty"`
|
||||
PromptEvalCount int `json:"prompt_eval_count,omitempty"`
|
||||
}
|
||||
|
||||
// EmbeddingRequest is the request passed to [Client.Embeddings].
|
||||
|
||||
@@ -35,7 +35,7 @@ func main() {
|
||||
|
||||
ctx := context.Background()
|
||||
req := &api.ChatRequest{
|
||||
Model: "llama3.1",
|
||||
Model: "llama3",
|
||||
Messages: messages,
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ func main() {
|
||||
|
||||
// By default, GenerateRequest is streaming.
|
||||
req := &api.GenerateRequest{
|
||||
Model: "gemma2",
|
||||
Model: "gemma",
|
||||
Prompt: "how many planets are there?",
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ func main() {
|
||||
}
|
||||
|
||||
req := &api.GenerateRequest{
|
||||
Model: "gemma2",
|
||||
Model: "gemma",
|
||||
Prompt: "how many planets are there?",
|
||||
|
||||
// set streaming to false
|
||||
|
||||
0
examples/go-http-generate/README.md
Normal file
0
examples/go-http-generate/README.md
Normal file
@@ -4,14 +4,6 @@ This example provides an interface for asking questions to a PDF document.
|
||||
|
||||
## Setup
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@@ -51,7 +51,7 @@ while True:
|
||||
template=template,
|
||||
)
|
||||
|
||||
llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
qa_chain = RetrievalQA.from_chain_type(
|
||||
llm,
|
||||
retriever=vectorstore.as_retriever(),
|
||||
|
||||
@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1
|
||||
ollama pull llama2
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
|
||||
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
|
||||
docs = loader.load()
|
||||
|
||||
llm = Ollama(model="llama3.1")
|
||||
llm = Ollama(model="llama3")
|
||||
chain = load_summarize_chain(llm, chain_type="stuff")
|
||||
|
||||
result = chain.invoke(docs)
|
||||
result = chain.invoke(docs)
|
||||
print(result)
|
||||
|
||||
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from langchain.llms import Ollama
|
||||
|
||||
input = input("What is your question?")
|
||||
llm = Ollama(model="llama3.1")
|
||||
llm = Ollama(model="llama3")
|
||||
res = llm.predict(input)
|
||||
print (res)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM llama3.1
|
||||
FROM llama3
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from super mario bros, acting as an assistant.
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
# Example character: Mario
|
||||
|
||||
This example shows how to create a basic character using Llama3.1 as the base model.
|
||||
This example shows how to create a basic character using Llama3 as the base model.
|
||||
|
||||
To run this example:
|
||||
|
||||
1. Download the Modelfile
|
||||
2. `ollama pull llama3.1` to get the base model used in the model file.
|
||||
2. `ollama pull llama3` to get the base model used in the model file.
|
||||
3. `ollama create NAME -f ./Modelfile`
|
||||
4. `ollama run NAME`
|
||||
|
||||
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
|
||||
What the model file looks like:
|
||||
|
||||
```
|
||||
FROM llama3.1
|
||||
FROM llama3
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from Super Mario Bros, acting as an assistant.
|
||||
|
||||
@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
|
||||
client = docker.from_env()
|
||||
s = requests.Session()
|
||||
output=""
|
||||
with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
|
||||
with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
|
||||
for line in r.iter_lines():
|
||||
if line:
|
||||
j = json.loads(line)
|
||||
|
||||
@@ -2,7 +2,7 @@ import requests
|
||||
import json
|
||||
import random
|
||||
|
||||
model = "llama3.1"
|
||||
model = "llama3"
|
||||
template = {
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
|
||||
@@ -12,7 +12,7 @@ countries = [
|
||||
"France",
|
||||
]
|
||||
country = random.choice(countries)
|
||||
model = "llama3.1"
|
||||
model = "llama3"
|
||||
|
||||
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
||||
|
||||
|
||||
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import requests
|
||||
|
||||
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
||||
model = "llama3.1" # TODO: update this for whatever model you wish to use
|
||||
model = "llama3" # TODO: update this for whatever model you wish to use
|
||||
|
||||
|
||||
def chat(messages):
|
||||
|
||||
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import * as readline from "readline";
|
||||
|
||||
const model = "llama3.1";
|
||||
const model = "llama3";
|
||||
type Message = {
|
||||
role: "assistant" | "user" | "system";
|
||||
content: string;
|
||||
|
||||
@@ -69,10 +69,6 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
|
||||
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
@@ -101,10 +97,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
|
||||
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
|
||||
7
llm/ext_server/server.cpp
vendored
7
llm/ext_server/server.cpp
vendored
@@ -1221,7 +1221,6 @@ struct llama_server_context
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", std::vector<float>(embd, embd + n_embd)},
|
||||
{"timings", slot.get_formated_timings()},
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -3204,15 +3203,11 @@ int main(int argc, char **argv) {
|
||||
|
||||
responses = result.result_json.value("results", std::vector<json>{result.result_json});
|
||||
json embeddings = json::array();
|
||||
|
||||
int prompt_n = 0;
|
||||
for (auto & elem : responses) {
|
||||
embeddings.push_back(elem.at("embedding"));
|
||||
prompt_n += elem.at("timings").at("prompt_n").get<int>();
|
||||
}
|
||||
|
||||
// send the result
|
||||
json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
|
||||
json embedding_res = json{{"embedding", embeddings}};
|
||||
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index a207451f..fba6b175 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
|
||||
hparams.attn_soft_cap = true;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
+ case 26: model.type = e_model::MODEL_2B; break;
|
||||
case 42: model.type = e_model::MODEL_9B; break;
|
||||
case 46: model.type = e_model::MODEL_27B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
@@ -11736,6 +11737,7 @@ struct llm_build_context {
|
||||
|
||||
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
|
||||
switch (model.type) {
|
||||
+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
|
||||
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
|
||||
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
@@ -33,7 +33,7 @@ type LlamaServer interface {
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
Embed(ctx context.Context, input []string) (*EmbedResponse, error)
|
||||
Embed(ctx context.Context, input []string) ([][]float32, error)
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
@@ -879,11 +879,10 @@ type EmbedRequest struct {
|
||||
}
|
||||
|
||||
type EmbedResponse struct {
|
||||
Embedding [][]float32 `json:"embedding"`
|
||||
PromptEvalCount int `json:"prompt_n"`
|
||||
Embedding [][]float32 `json:"embedding"`
|
||||
}
|
||||
|
||||
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
|
||||
func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return nil, err
|
||||
@@ -925,12 +924,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
|
||||
return nil, fmt.Errorf("%s", body)
|
||||
}
|
||||
|
||||
var e EmbedResponse
|
||||
if err := json.Unmarshal(body, &e); err != nil {
|
||||
var embedding EmbedResponse
|
||||
if err := json.Unmarshal(body, &embedding); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||
}
|
||||
|
||||
return &e, nil
|
||||
return embedding.Embedding, nil
|
||||
}
|
||||
|
||||
type TokenizeRequest struct {
|
||||
|
||||
@@ -192,9 +192,9 @@ func toolCallId() string {
|
||||
return "call_" + strings.ToLower(string(b))
|
||||
}
|
||||
|
||||
func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
|
||||
for i, tc := range r.Message.ToolCalls {
|
||||
func parseToolCalls(respToolCalls []api.ToolCall) []ToolCall {
|
||||
toolCalls := make([]ToolCall, len(respToolCalls))
|
||||
for i, tc := range respToolCalls {
|
||||
toolCalls[i].ID = toolCallId()
|
||||
toolCalls[i].Type = "function"
|
||||
toolCalls[i].Function.Name = tc.Function.Name
|
||||
@@ -207,6 +207,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
|
||||
toolCalls[i].Function.Arguments = string(args)
|
||||
}
|
||||
return toolCalls
|
||||
}
|
||||
|
||||
func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
toolCalls := parseToolCalls(r.Message.ToolCalls)
|
||||
|
||||
return ChatCompletion{
|
||||
Id: id,
|
||||
@@ -218,9 +223,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
Index: 0,
|
||||
Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
|
||||
FinishReason: func(reason string) *string {
|
||||
if len(toolCalls) > 0 {
|
||||
reason = "tool_calls"
|
||||
}
|
||||
if len(reason) > 0 {
|
||||
return &reason
|
||||
}
|
||||
@@ -236,6 +238,8 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
}
|
||||
|
||||
func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
|
||||
toolCalls := parseToolCalls(r.Message.ToolCalls)
|
||||
|
||||
return ChatCompletionChunk{
|
||||
Id: id,
|
||||
Object: "chat.completion.chunk",
|
||||
@@ -244,7 +248,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
|
||||
SystemFingerprint: "fp_ollama",
|
||||
Choices: []ChunkChoice{{
|
||||
Index: 0,
|
||||
Delta: Message{Role: "assistant", Content: r.Message.Content},
|
||||
Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
|
||||
FinishReason: func(reason string) *string {
|
||||
if len(reason) > 0 {
|
||||
return &reason
|
||||
|
||||
@@ -284,7 +284,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
checkpointStart := time.Now()
|
||||
var req api.EmbedRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -333,8 +332,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
checkpointLoaded := time.Now()
|
||||
|
||||
kvData, err := getKVData(m.ModelPath, false)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
@@ -373,16 +370,13 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
for i, e := range embeddings.Embedding {
|
||||
embeddings.Embedding[i] = normalize(e)
|
||||
for i, e := range embeddings {
|
||||
embeddings[i] = normalize(e)
|
||||
}
|
||||
|
||||
resp := api.EmbedResponse{
|
||||
Model: req.Model,
|
||||
Embeddings: embeddings.Embedding,
|
||||
TotalDuration: time.Since(checkpointStart),
|
||||
LoadDuration: checkpointLoaded.Sub(checkpointStart),
|
||||
PromptEvalCount: embeddings.PromptEvalCount,
|
||||
Model: req.Model,
|
||||
Embeddings: embeddings,
|
||||
}
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
@@ -434,9 +428,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
embedding := make([]float64, len(embeddings.Embedding[0]))
|
||||
embedding := make([]float64, len(embeddings[0]))
|
||||
|
||||
for i, v := range embeddings.Embedding[0] {
|
||||
for i, v := range embeddings[0] {
|
||||
embedding[i] = float64(v)
|
||||
}
|
||||
|
||||
@@ -1375,7 +1369,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
if (req.Stream != nil && !*req.Stream) || ((req.Stream == nil || *req.Stream) && len(req.Tools) > 0) {
|
||||
var resp api.ChatResponse
|
||||
var sb strings.Builder
|
||||
for rr := range ch {
|
||||
@@ -1406,6 +1400,26 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
if (req.Stream == nil || *req.Stream) && len(resp.Message.ToolCalls) > 0 {
|
||||
toolCh := make(chan any)
|
||||
go func() {
|
||||
defer close(toolCh)
|
||||
toolCalls := resp.Message.ToolCalls
|
||||
for _, toolCall := range toolCalls {
|
||||
toolCh <- api.ChatResponse{
|
||||
Model: resp.Model,
|
||||
CreatedAt: resp.CreatedAt,
|
||||
Message: api.Message{Role: "assistant", ToolCalls: []api.ToolCall{toolCall}},
|
||||
}
|
||||
}
|
||||
resp.Message.ToolCalls = nil
|
||||
resp.DoneReason = "tool_calls"
|
||||
toolCh <- resp
|
||||
}()
|
||||
streamResponse(c, toolCh)
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, resp)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -212,12 +212,9 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
} else if loadedCount == 0 {
|
||||
// No models loaded. Load the model but prefer the best fit.
|
||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
|
||||
g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
|
||||
if g != nil {
|
||||
gpus = g
|
||||
} else {
|
||||
// Only allow partial loads when this is the first model
|
||||
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
|
||||
}
|
||||
s.loadFn(pending, ggml, gpus, numParallel)
|
||||
break
|
||||
@@ -234,7 +231,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
|
||||
// Update free memory from currently loaded models
|
||||
s.updateFreeSpace(availGpus)
|
||||
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
|
||||
fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
|
||||
if fitGpus != nil {
|
||||
slog.Debug("new model fits with existing models, loading")
|
||||
s.loadFn(pending, ggml, fitGpus, numParallel)
|
||||
@@ -671,12 +668,11 @@ func (a ByDuration) Less(i, j int) bool {
|
||||
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
|
||||
|
||||
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||
// The list of GPUs returned will always be the same brand (library)
|
||||
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||
// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
|
||||
// opts.NumCtx accordingly
|
||||
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
||||
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
||||
var estimatedVRAM uint64
|
||||
|
||||
var numParallelToTry []int
|
||||
@@ -727,25 +723,6 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
|
||||
return nil
|
||||
}
|
||||
|
||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
||||
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
||||
*numParallel = 1
|
||||
byLibrary := gpus.ByLibrary()
|
||||
if len(byLibrary) <= 1 {
|
||||
return gpus
|
||||
}
|
||||
var bestEstimate uint64
|
||||
var bestFit int
|
||||
for i, gl := range byLibrary {
|
||||
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
|
||||
if estimatedVRAM > bestEstimate {
|
||||
bestEstimate = estimatedVRAM
|
||||
bestFit = i
|
||||
}
|
||||
}
|
||||
return byLibrary[bestFit]
|
||||
}
|
||||
|
||||
// findRunnerToUnload finds a runner to unload to make room for a new model
|
||||
func (s *Scheduler) findRunnerToUnload() *runnerRef {
|
||||
s.loadedMu.Lock()
|
||||
|
||||
@@ -666,50 +666,11 @@ func TestAlreadyCanceled(t *testing.T) {
|
||||
require.Empty(t, scenario1a.req.successCh)
|
||||
}
|
||||
|
||||
func TestHomogeneousGPUs(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
|
||||
s.getGpuFn = func() gpu.GpuInfoList {
|
||||
// Set memory values to require the model to be spread
|
||||
gpus := []gpu.GpuInfo{
|
||||
{Library: "cuda"},
|
||||
{Library: "rocm"},
|
||||
}
|
||||
gpus[0].TotalMemory = 1 * format.GibiByte
|
||||
gpus[0].FreeMemory = 256 * format.MebiByte
|
||||
gpus[1].TotalMemory = 1 * format.GibiByte
|
||||
gpus[1].FreeMemory = 256 * format.MebiByte
|
||||
return gpus
|
||||
}
|
||||
s.getCpuFn = getCpuFn
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
require.Len(t, gpus, 1)
|
||||
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
|
||||
}
|
||||
slog.Info("a")
|
||||
s.pendingReqCh <- a.req
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-a.req.successCh:
|
||||
require.Equal(t, resp.llama, a.srv)
|
||||
require.Empty(t, s.pendingReqCh)
|
||||
require.Empty(t, a.req.errCh)
|
||||
case err := <-a.req.errCh:
|
||||
t.Fatal(err.Error())
|
||||
case <-ctx.Done():
|
||||
t.Fatal("timeout")
|
||||
}
|
||||
}
|
||||
|
||||
type mockLlm struct {
|
||||
pingResp error
|
||||
waitResp error
|
||||
completionResp error
|
||||
embedResp *llm.EmbedResponse
|
||||
embedResp [][]float32
|
||||
embedRespErr error
|
||||
tokenizeResp []int
|
||||
tokenizeRespErr error
|
||||
@@ -727,7 +688,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
|
||||
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
||||
return s.completionResp
|
||||
}
|
||||
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
|
||||
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
|
||||
return s.embedResp, s.embedRespErr
|
||||
}
|
||||
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
|
||||
Reference in New Issue
Block a user