oai compat

correct output
rm comments
2024-07-30 11:29:44 -07:00 · 2024-07-29 17:12:39 -07:00 · 2024-07-29 17:02:03 -07:00 · 2024-07-29 16:59:02 -07:00
29 changed files with 76 additions and 166 deletions
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 ### Multimodal models

 ```
-ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
+>>> What's in this image? /Users/jmorgan/Desktop/smile.png
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```

--- a/api/types.go
+++ b/api/types.go
@@ -267,10 +267,6 @@ type EmbedRequest struct {
 type EmbedResponse struct {
 	Model      string      `json:"model"`
 	Embeddings [][]float32 `json:"embeddings"`
-
-	TotalDuration   time.Duration `json:"total_duration,omitempty"`
-	LoadDuration    time.Duration `json:"load_duration,omitempty"`
-	PromptEvalCount int           `json:"prompt_eval_count,omitempty"`
 }

 // EmbeddingRequest is the request passed to [Client.Embeddings].
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -35,7 +35,7 @@ func main() {

 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama3.1",
+		Model:    "llama3",
 		Messages: messages,
 	}

--- a/examples/go-generate-streaming/main.go
+++ b/examples/go-generate-streaming/main.go
@@ -16,7 +16,7 @@ func main() {

 	// By default, GenerateRequest is streaming.
 	req := &api.GenerateRequest{
-		Model:  "gemma2",
+		Model:  "gemma",
 		Prompt: "how many planets are there?",
 	}

--- a/examples/go-generate/main.go
+++ b/examples/go-generate/main.go
@@ -15,7 +15,7 @@ func main() {
 	}

 	req := &api.GenerateRequest{
-		Model:  "gemma2",
+		Model:  "gemma",
 		Prompt: "how many planets are there?",

 		// set streaming to false
--- a/examples/go-http-generate/README.md
+++ b/examples/go-http-generate/README.md
--- a/examples/langchain-python-rag-document/README.md
+++ b/examples/langchain-python-rag-document/README.md
@@ -4,14 +4,6 @@ This example provides an interface for asking questions to a PDF document.

 ## Setup

-1. Ensure you have the `llama3.1` model installed:
-
-```
-ollama pull llama3.1
-```
-
-2. Install the Python Requirements.
-
 ```
 pip install -r requirements.txt
 ```
--- a/examples/langchain-python-rag-document/main.py
+++ b/examples/langchain-python-rag-document/main.py
@@ -51,7 +51,7 @@ while True:
        template=template,
    )

-    llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama2` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama2
   ```

 2. Install the Python Requirements.
--- a/examples/langchain-python-rag-websummary/main.py
+++ b/examples/langchain-python-rag-websummary/main.py
@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()

-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3")
 chain = load_summarize_chain(llm, chain_type="stuff")

-result = chain.invoke(docs)
+result = chain.invoke(docs) 
 print(result)
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama

 input = input("What is your question?")
-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3")
 res = llm.predict(input)
 print (res)
--- a/examples/modelfile-mario/Modelfile
+++ b/examples/modelfile-mario/Modelfile
@@ -1,4 +1,4 @@
-FROM llama3.1
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
--- a/examples/modelfile-mario/readme.md
+++ b/examples/modelfile-mario/readme.md
@@ -2,12 +2,12 @@

 # Example character: Mario

-This example shows how to create a basic character using Llama3.1 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.

 To run this example:

 1. Download the Modelfile
-2. `ollama pull llama3.1` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`

@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:

 ```
-FROM llama3.1
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.
--- a/examples/python-dockerit/dockerit.py
+++ b/examples/python-dockerit/dockerit.py
@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
 client = docker.from_env()
 s = requests.Session()
 output=""
-with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
+with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
  for line in r.iter_lines():
    if line:
      j = json.loads(line)
--- a/examples/python-json-datagenerator/predefinedschema.py
+++ b/examples/python-json-datagenerator/predefinedschema.py
@@ -2,7 +2,7 @@ import requests
 import json
 import random

-model = "llama3.1"
+model = "llama3"
 template = {
  "firstName": "",
  "lastName": "",
--- a/examples/python-json-datagenerator/randomaddresses.py
+++ b/examples/python-json-datagenerator/randomaddresses.py
@@ -12,7 +12,7 @@ countries = [
    "France",
 ]
 country = random.choice(countries)
-model = "llama3.1"
+model = "llama3"

 prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama3.1"  # TODO: update this for whatever model you wish to use
+model = "llama3"  # TODO: update this for whatever model you wish to use


 def chat(messages):
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -1,6 +1,6 @@
 import * as readline from "readline";

-const model = "llama3.1";
+const model = "llama3";
 type Message = {
  role: "assistant" | "user" | "system";
  content: string;
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -69,10 +69,6 @@ func TestAllMiniLMEmbed(t *testing.T) {
 	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
-
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
-	}
 }

 func TestAllMiniLMBatchEmbed(t *testing.T) {
@@ -101,10 +97,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
-
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
-	}
 }

 func TestAllMiniLMEmbedTruncate(t *testing.T) {
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1221,7 +1221,6 @@ struct llama_server_context
                res.result_json = json
                {
                    {"embedding", std::vector<float>(embd, embd + n_embd)},
-                    {"timings",             slot.get_formated_timings()},
                };
            }
        }
@@ -3204,15 +3203,11 @@ int main(int argc, char **argv) {

                    responses = result.result_json.value("results", std::vector<json>{result.result_json});
                    json embeddings = json::array();
-
-                    int prompt_n = 0;
                    for (auto & elem : responses) {
                        embeddings.push_back(elem.at("embedding"));
-                        prompt_n += elem.at("timings").at("prompt_n").get<int>();
                    }
-
                    // send the result
-                    json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
+                    json embedding_res = json{{"embedding", embeddings}};
                    return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
                }
            });
--- a/llm/patches/10-params.diff
+++ b/llm/patches/10-params.diff
@@ -1,20 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..fba6b175 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
-                 hparams.attn_soft_cap = true;
- 
-                 switch (hparams.n_layer) {
-+                    case 26: model.type = e_model::MODEL_2B; break;
-                     case 42: model.type = e_model::MODEL_9B; break;
-                     case 46: model.type = e_model::MODEL_27B; break;
-                     default: model.type = e_model::MODEL_UNKNOWN;
-@@ -11736,6 +11737,7 @@ struct llm_build_context {
- 
-                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
-                 switch (model.type) {
-+                    case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                     case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
-                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-                     default: GGML_ABORT("fatal error");
--- a/llm/server.go
+++ b/llm/server.go
@@ -33,7 +33,7 @@ type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embed(ctx context.Context, input []string) (*EmbedResponse, error)
+	Embed(ctx context.Context, input []string) ([][]float32, error)
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
@@ -879,11 +879,10 @@ type EmbedRequest struct {
 }

 type EmbedResponse struct {
-	Embedding       [][]float32 `json:"embedding"`
-	PromptEvalCount int         `json:"prompt_n"`
+	Embedding [][]float32 `json:"embedding"`
 }

-func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
+func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
 		return nil, err
@@ -925,12 +924,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
 		return nil, fmt.Errorf("%s", body)
 	}

-	var e EmbedResponse
-	if err := json.Unmarshal(body, &e); err != nil {
+	var embedding EmbedResponse
+	if err := json.Unmarshal(body, &embedding); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

-	return &e, nil
+	return embedding.Embedding, nil
 }

 type TokenizeRequest struct {
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -192,9 +192,9 @@ func toolCallId() string {
 	return "call_" + strings.ToLower(string(b))
 }

-func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
-	toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
-	for i, tc := range r.Message.ToolCalls {
+func parseToolCalls(respToolCalls []api.ToolCall) []ToolCall {
+	toolCalls := make([]ToolCall, len(respToolCalls))
+	for i, tc := range respToolCalls {
 		toolCalls[i].ID = toolCallId()
 		toolCalls[i].Type = "function"
 		toolCalls[i].Function.Name = tc.Function.Name
@@ -207,6 +207,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {

 		toolCalls[i].Function.Arguments = string(args)
 	}
+	return toolCalls
+}
+
+func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
+	toolCalls := parseToolCalls(r.Message.ToolCalls)

 	return ChatCompletion{
 		Id:                id,
@@ -218,9 +223,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 			Index:   0,
 			Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
 			FinishReason: func(reason string) *string {
-				if len(toolCalls) > 0 {
-					reason = "tool_calls"
-				}
 				if len(reason) > 0 {
 					return &reason
 				}
@@ -236,6 +238,8 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 }

 func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
+	toolCalls := parseToolCalls(r.Message.ToolCalls)
+
 	return ChatCompletionChunk{
 		Id:                id,
 		Object:            "chat.completion.chunk",
@@ -244,7 +248,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 		SystemFingerprint: "fp_ollama",
 		Choices: []ChunkChoice{{
 			Index: 0,
-			Delta: Message{Role: "assistant", Content: r.Message.Content},
+			Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
 			FinishReason: func(reason string) *string {
 				if len(reason) > 0 {
 					return &reason
--- a/server/routes.go
+++ b/server/routes.go
@@ -284,7 +284,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 }

 func (s *Server) EmbedHandler(c *gin.Context) {
-	checkpointStart := time.Now()
 	var req api.EmbedRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -333,8 +332,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	checkpointLoaded := time.Now()
-
 	kvData, err := getKVData(m.ModelPath, false)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -373,16 +370,13 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	for i, e := range embeddings.Embedding {
-		embeddings.Embedding[i] = normalize(e)
+	for i, e := range embeddings {
+		embeddings[i] = normalize(e)
 	}

 	resp := api.EmbedResponse{
-		Model:           req.Model,
-		Embeddings:      embeddings.Embedding,
-		TotalDuration:   time.Since(checkpointStart),
-		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
-		PromptEvalCount: embeddings.PromptEvalCount,
+		Model:      req.Model,
+		Embeddings: embeddings,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -434,9 +428,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding := make([]float64, len(embeddings.Embedding[0]))
+	embedding := make([]float64, len(embeddings[0]))

-	for i, v := range embeddings.Embedding[0] {
+	for i, v := range embeddings[0] {
 		embedding[i] = float64(v)
 	}

@@ -1375,7 +1369,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 	}()

-	if req.Stream != nil && !*req.Stream {
+	if (req.Stream != nil && !*req.Stream) || ((req.Stream == nil || *req.Stream) && len(req.Tools) > 0) {
 		var resp api.ChatResponse
 		var sb strings.Builder
 		for rr := range ch {
@@ -1406,6 +1400,26 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}
 		}

+		if (req.Stream == nil || *req.Stream) && len(resp.Message.ToolCalls) > 0 {
+			toolCh := make(chan any)
+			go func() {
+				defer close(toolCh)
+				toolCalls := resp.Message.ToolCalls
+				for _, toolCall := range toolCalls {
+					toolCh <- api.ChatResponse{
+						Model:     resp.Model,
+						CreatedAt: resp.CreatedAt,
+						Message:   api.Message{Role: "assistant", ToolCalls: []api.ToolCall{toolCall}},
+					}
+				}
+				resp.Message.ToolCalls = nil
+				resp.DoneReason = "tool_calls"
+				toolCh <- resp
+			}()
+			streamResponse(c, toolCh)
+			return
+		}
+
 		c.JSON(http.StatusOK, resp)
 		return
 	}
--- a/server/sched.go
+++ b/server/sched.go
@@ -212,12 +212,9 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
+						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
-						} else {
-							// Only allow partial loads when this is the first model
-							gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 						}
 						s.loadFn(pending, ggml, gpus, numParallel)
 						break
@@ -234,7 +231,7 @@ func (s *Scheduler) processPending(ctx context.Context) {

 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
-						fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
+						fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
 						if fitGpus != nil {
 							slog.Debug("new model fits with existing models, loading")
 							s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -671,12 +668,11 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

-// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
-// The list of GPUs returned will always be the same brand (library)
+// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@@ -727,25 +723,6 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	return nil
 }

-// If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	*numParallel = 1
-	byLibrary := gpus.ByLibrary()
-	if len(byLibrary) <= 1 {
-		return gpus
-	}
-	var bestEstimate uint64
-	var bestFit int
-	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
-		if estimatedVRAM > bestEstimate {
-			bestEstimate = estimatedVRAM
-			bestFit = i
-		}
-	}
-	return byLibrary[bestFit]
-}
-
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -666,50 +666,11 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }

-func TestHomogeneousGPUs(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
-	defer done()
-	s := InitScheduler(ctx)
-
-	s.getGpuFn = func() gpu.GpuInfoList {
-		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
-			{Library: "cuda"},
-			{Library: "rocm"},
-		}
-		gpus[0].TotalMemory = 1 * format.GibiByte
-		gpus[0].FreeMemory = 256 * format.MebiByte
-		gpus[1].TotalMemory = 1 * format.GibiByte
-		gpus[1].FreeMemory = 256 * format.MebiByte
-		return gpus
-	}
-	s.getCpuFn = getCpuFn
-	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
-	}
-	slog.Info("a")
-	s.pendingReqCh <- a.req
-	require.Len(t, s.pendingReqCh, 1)
-	s.Run(ctx)
-	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
-		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
-		t.Fatal(err.Error())
-	case <-ctx.Done():
-		t.Fatal("timeout")
-	}
-}
-
 type mockLlm struct {
 	pingResp           error
 	waitResp           error
 	completionResp     error
-	embedResp          *llm.EmbedResponse
+	embedResp          [][]float32
 	embedRespErr       error
 	tokenizeResp       []int
 	tokenizeRespErr    error
@@ -727,7 +688,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
 	return s.completionResp
 }
-func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
+func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
 	return s.embedResp, s.embedRespErr
 }
 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
Author	SHA1	Message	Date
Roy Han	f16b3db70c	oai compat	2024-07-30 11:29:44 -07:00
Roy Han	23ff673bdc	correct output	2024-07-29 17:12:39 -07:00
Roy Han	7950053972	rm comments	2024-07-29 17:02:03 -07:00
Roy Han	d2b25c1bfb	draft	2024-07-29 16:59:02 -07:00