disable execstack for amd libraries

2024-03-10 15:08:46 -07:00
26 changed files with 121 additions and 337 deletions
--- a/6
+++ b/6
@@ -42,7 +42,7 @@ ARG AMDGPU_TARGETS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 RUN mkdir /tmp/scratch && \
    for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
-        cp ${dep} /tmp/scratch/ || exit 1 ; \
+    cp ${dep} /tmp/scratch/ || exit 1 ; \
    done && \
    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
    mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
@@ -92,7 +92,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/b
 COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build .
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -103,7 +103,7 @@ COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build .
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -1,50 +0,0 @@
 package api
 import (
 	"encoding/json"
 	"math"
 	"testing"
 	"time"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 // TestKeepAliveParsingFromJSON checks how the "keep_alive" field of a
 // ChatRequest is decoded from JSON. Each case supplies a raw request body and
 // the Duration the decoded request is expected to carry: a bare number is
 // expected as seconds, a string as a parsed duration, and both negative inputs
 // as math.MaxInt64.
 func TestKeepAliveParsingFromJSON(t *testing.T) {
 	type keepAliveCase struct {
 		desc    string    // subtest name
 		payload string    // raw JSON request body
 		want    *Duration // expected ChatRequest.KeepAlive after decoding
 	}
 	cases := []keepAliveCase{
 		{desc: "Positive Integer", payload: `{ "keep_alive": 42 }`, want: &Duration{42 * time.Second}},
 		{desc: "Positive Integer String", payload: `{ "keep_alive": "42m" }`, want: &Duration{42 * time.Minute}},
 		{desc: "Negative Integer", payload: `{ "keep_alive": -1 }`, want: &Duration{math.MaxInt64}},
 		{desc: "Negative Integer String", payload: `{ "keep_alive": "-1m" }`, want: &Duration{math.MaxInt64}},
 	}
 	for _, tc := range cases {
 		t.Run(tc.desc, func(t *testing.T) {
 			var got ChatRequest
 			// Decoding must succeed before the parsed value is compared.
 			require.NoError(t, json.Unmarshal([]byte(tc.payload), &got))
 			assert.Equal(t, tc.want, got.KeepAlive)
 		})
 	}
 }
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -900,7 +900,8 @@ func NewCLI() *cobra.Command {
 	cobra.EnableCommandSorting = false
 	if runtime.GOOS == "windows" {
-		console.ConsoleFromFile(os.Stdin) //nolint:errcheck
+		// Enable colorful ANSI escape code in Windows terminal (disabled by default)
 		console.ConsoleFromFile(os.Stdout) //nolint:errcheck
 	}
 	rootCmd := &cobra.Command{
@@ -969,10 +970,9 @@ func NewCLI() *cobra.Command {
 	serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
 Environment Variables:
-    OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
+    OLLAMA_HOST       The host:port to bind to (default "127.0.0.1:11434")
-    OLLAMA_ORIGINS      A comma separated list of allowed origins.
+    OLLAMA_ORIGINS    A comma separated list of allowed origins.
-    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
+    OLLAMA_MODELS     The path to the models directory (default is "~/.ollama/models")
    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
 `)
 	pullCmd := &cobra.Command{
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -103,9 +103,9 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
 			return []llm.Tensor{}, 0, err
 		}
-		shape := []uint64{0, 0, 0, 0}
+		shape := [4]uint64{1, 1, 1, 1}
-		for i := range data.Shape {
+		for cnt, s := range data.Shape {
-			shape[i] = uint64(data.Shape[i])
+			shape[cnt] = uint64(s)
 		}
 		t := llm.Tensor{
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,7 +3,7 @@
 ### Getting Started
 * [Quickstart](../README.md#quickstart)
 * [Examples](../examples)
-* [Importing models](./import.md)
+* [Importing models](./import.md) from GGUF, PyTorch and Safetensors
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -193,13 +193,3 @@ To unload the model and free up memory use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
 ```
 ## Controlling which GPUs to use
 By default, on Linux and Windows, Ollama will attempt to use Nvidia GPUs, or
 Radeon GPUs, and will use all the GPUs it can find. You can limit which GPUs
 will be utilized by setting the environment variable `CUDA_VISIBLE_DEVICES` for
 NVIDIA cards, or `HIP_VISIBLE_DEVICES` for Radeon GPUs to a comma delimited list
 of GPU IDs. You can see the list of devices with GPU tools such as `nvidia-smi` or
 `rocminfo`. You can set the variable to an invalid GPU ID (e.g., "-1") to bypass the GPU and
 fall back to the CPU.
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
 PARAMETER <parameter> <parametervalue>
 ```
-#### Valid Parameters and Values
+### Valid Parameters and Values
 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -201,22 +201,7 @@ LICENSE """
 ### MESSAGE
-The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
+The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
 ```modelfile
 MESSAGE <role> <message>
 ```
 #### Valid roles
 | Role      | Description                                                  |
 | --------- | ------------------------------------------------------------ |
 | system    | Alternate way of providing the SYSTEM message for the model. |
 | user      | An example message of what the user could have asked.        |
 | assistant | An example message of how the model should respond.          |
 #### Example conversation
 ```modelfile
 MESSAGE user Is Toronto in Canada?
@@ -227,7 +212,6 @@ MESSAGE user Is Ontario in Canada?
 MESSAGE assistant yes
 ```
 ## Notes
 - The **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make them easier to distinguish from arguments.
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -109,3 +109,7 @@ which version to install.
 ```sh
 curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
 ```
 ## Known issues
 * N/A
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -40,17 +40,19 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
 	// TODO - does sort order matter?
 	devices := []string{}
 	for i := range ids {
 		slog.Debug(fmt.Sprintf("i=%d", i))
 		if _, skipped := skip[i]; skipped {
 			slog.Debug("skipped")
 			continue
 		}
 		devices = append(devices, strconv.Itoa(i))
 	}
 	slog.Debug(fmt.Sprintf("devices=%v", devices))
 	val := strings.Join(devices, ",")
 	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
 	} else {
 		slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
 	}
 	slog.Debug("HIP_VISIBLE_DEVICES=" + val)
 }
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -24,9 +24,6 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 	RocmStandardLocation   = "/opt/rocm/lib"
 	// TODO find a better way to detect iGPU instead of minimum memory
 	IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
 )
 var (
@@ -149,8 +146,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 	resp.memInfo.DeviceCount = 0
 	resp.memInfo.TotalMemory = 0
 	resp.memInfo.FreeMemory = 0
 	slog.Debug("discovering VRAM for amdgpu devices")
 	if len(ids) == 0 {
 		slog.Debug("discovering all amdgpu devices")
 		entries, err := os.ReadDir(AMDNodesSysfsDir)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@@ -168,7 +165,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			ids = append(ids, id)
 		}
 	}
-	slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
+	slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
 	for _, id := range ids {
 		if _, skipped := skip[id]; skipped {
@@ -176,8 +173,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 		}
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		// Adjust for sysfs vs HIP ids
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
 		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
 		propFiles, err := filepath.Glob(propGlob)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@@ -209,13 +205,6 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 		}
 		if totalMemory == 0 {
 			slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
 			skip[id] = struct{}{}
 			continue
 		}
 		if totalMemory < IGPUMemLimit {
 			slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
 			skip[id] = struct{}{}
 			continue
 		}
 		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@@ -243,8 +232,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 			usedMemory += used
 		}
-		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
+		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
-		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %dM", id, (totalMemory-usedMemory)/1024/1024))
+		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %d", id, (totalMemory - usedMemory)))
 		resp.memInfo.DeviceCount++
 		resp.memInfo.TotalMemory += totalMemory
 		resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@@ -293,7 +282,7 @@ func AMDValidateLibDir() (string, error) {
 	}
 	// If we already have a rocm dependency wired, nothing more to do
-	rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
+	rocmTargetDir := filepath.Join(payloadsDir, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		return rocmTargetDir, nil
 	}
@@ -369,8 +358,6 @@ func AMDDriverVersion() (string, error) {
 }
 func AMDGFXVersions() map[int]Version {
 	// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	res := map[int]Version{}
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
 	for _, match := range matches {
@@ -386,20 +373,17 @@ func AMDGFXVersions() map[int]Version {
 			continue
 		}
 		if i == 0 {
 			// Skipping the CPU
 			continue
 		}
 		// Align with HIP IDs (zero is first GPU, not CPU)
 		i -= 1
 		scanner := bufio.NewScanner(fp)
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			if strings.HasPrefix(line, "gfx_target_version") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 || len(ver[1]) < 5 {
-					if ver[1] != "0" {
+
 					if ver[1] == "0" {
 						// Silently skip the CPU
 						continue
 					} else {
 						slog.Debug("malformed " + line)
 					}
 					res[i] = Version{
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -23,9 +23,7 @@ func PayloadsDir() (string, error) {
 		if err != nil {
 			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 		}
-		// We create a distinct subdirectory for payloads within the tmpdir
+		payloadsDir = tmpDir
 		// This will typically look like /tmp/ollama3208993108/runners on linux
 		payloadsDir = filepath.Join(tmpDir, "runners")
 	}
 	return payloadsDir, nil
 }
@@ -34,12 +32,10 @@ func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
 	if payloadsDir != "" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
+		slog.Debug("cleaning up", "dir", payloadsDir)
-		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
+		err := os.RemoveAll(payloadsDir)
 		slog.Debug("cleaning up", "dir", tmpDir)
 		err := os.RemoveAll(tmpDir)
 		if err != nil {
-			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+			slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
 		}
 	}
 }
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -155,8 +155,8 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
      }
    }
-    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
    resp->total += memInfo.total;
    resp->free += memInfo.free;
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
-	initResp := newExtServerResp(512)
+	initResp := newExtServerResp(128)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
@@ -198,9 +198,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
 		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
 			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
 		}
 	}
 	retryDelay := 100 * time.Microsecond
@@ -228,14 +225,17 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		}
 		retryNeeded := false
 		// keep track of the last token generated, this is used to abort if the model starts looping
 		var lastToken string
 		var tokenRepeat int
 	out:
 		for {
 			select {
 			case <-ctx.Done():
-				return cancelCompletion(llm, resp)
+				// This handles the request cancellation
 				C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
 				if resp.id < 0 {
 					return extServerResponseToErr(resp)
 				} else {
 					return nil
 				}
 			default:
 				var result C.ext_server_task_result_t
 				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -258,20 +258,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 					break out
 				}
 				switch {
 				case strings.TrimSpace(p.Content) == lastToken:
 					tokenRepeat++
 				default:
 					lastToken = strings.TrimSpace(p.Content)
 					tokenRepeat = 0
 				}
 				// 30 picked as an arbitrary max token repeat limit, modify as needed
 				if tokenRepeat > 30 {
 					slog.Debug("prediction aborted, token repeat limit reached")
 					return cancelCompletion(llm, resp)
 				}
 				if p.Content != "" {
 					fn(PredictResult{
 						Content: p.Content,
@@ -299,15 +285,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 	return fmt.Errorf("max retries exceeded")
 }
 // cancelCompletion asks the external server to cancel the completion whose
 // task id is resp.id, passing resp by address so the call can update it.
 // A negative resp.id after the call is converted to an error with
 // extServerResponseToErr; otherwise nil is returned.
 func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
 	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
 	if resp.id >= 0 {
 		return nil
 	}
 	return extServerResponseToErr(resp)
 }
 func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS
 // Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
+server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
  err->id = 0;
  err->msg[0] = '\0';
  try {
-    llama = new llama_server_context;
+    llama = new server_context;
    gpt_params params;
    params.n_ctx = sparams->n_ctx;
    params.n_batch = sparams->n_batch;
@@ -114,14 +114,18 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
    llama_backend_init();
    llama_numa_init(params.numa);
-  if (!llama->load_model(params)) { 
+    // load the model
-    // an error occurred that was not thrown
+    if (!llama->load_model(params)) {
-    err->id = -1;
+      // TODO - consider modifying the logging logic or patching load_model so
-    snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      // we can capture more detailed error messages and pass them back to the
-    return;
+      // caller for better UX
-  }
+      err->id = -1;
      snprintf(err->msg, err->msg_len, "error loading model %s",
               params.model.c_str());
      return;
    }
-    llama->initialize();
+    llama->init();
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -140,13 +144,13 @@ void llama_server_start() {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
      llama->queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, llama, std::placeholders::_1));
+        &server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+        &server_context::on_finish_multitask, llama, std::placeholders::_1));
      llama->queue_tasks.on_run_slots(std::bind(
-        &llama_server_context::update_slots, llama));
+        &server_context::update_slots, llama));
      llama->queue_results.on_multitask_update(std::bind(
-          &llama_server_queue::update_multitask,
+          &server_queue::update_multitask,
          &llama->queue_tasks,
          std::placeholders::_1,
          std::placeholders::_2,
@@ -194,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
    json data = json::parse(json_req);
    resp->id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
+    llama->request_completion(resp->id, -1, data, false, false);
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
@@ -212,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
  std::string result_json;
  try {
    atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
    result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
@@ -359,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
    }
    const int task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
-    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
    atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
-    std::string result_json = result.result_json.dump();
+    std::string result_json = result.data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,6 +18,19 @@ sign() {
    fi
 }
 # bundle_metal merges ggml-common.h and ggml-metal.metal into a single
 # ggml-metal.metal under ${LLAMACPP_DIR}. The '#include "ggml-common.h"' line
 # and all '#pragma once' lines are dropped, and GGML_COMMON_IMPL_METAL is
 # defined at the top of the result.
 bundle_metal() {
    # Stash the filtered shader first: ggml-metal.metal is overwritten below.
    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" \
        | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
    # Rebuild the file as: define, common header (minus pragma), filtered shader.
    {
        echo '#define GGML_COMMON_IMPL_METAL'
        grep -v '#pragma once' "${LLAMACPP_DIR}/ggml-common.h"
        cat "${LLAMACPP_DIR}/ggml-metal.metal.temp"
    } > "${LLAMACPP_DIR}/ggml-metal.metal"
    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
 }
 # cleanup_metal restores ggml-metal.metal inside ${LLAMACPP_DIR} to its
 # checked-in contents with git checkout, discarding the rewrite that
 # bundle_metal performs on that file.
 cleanup_metal() {
    # Quote the directory so a path with spaces or glob characters is not
    # word-split; "--" makes git treat the argument as a path, never a ref.
    (cd "${LLAMACPP_DIR}" && git checkout -- ggml-metal.metal)
 }
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
 case "${GOARCH}" in
@@ -63,9 +76,11 @@ case "${GOARCH}" in
    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    bundle_metal
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
    compress_libs
    cleanup_metal
    ;;
 *)
    echo "GOARCH must be set"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -185,21 +185,19 @@ if [ -d "${ROCM_PATH}" ]; then
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
    # Record the ROCM dependencies
    rm -f "${BUILD_DIR}/lib/deps.txt"
    touch "${BUILD_DIR}/lib/deps.txt"
    # having the execstack bit set on the HIP runtime sometimes causes `ldd` to error
    execstack -c "${ROCM_PATH}/lib/libamdhip64.so*"
    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
    done
    # bomb out if for some reason we didn't get a few deps
    if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
        cat "${BUILD_DIR}/lib/deps.txt"
        echo "ERROR: deps file short"
        exit 1
    fi
    compress_libs
 fi
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,21 +1,19 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 8fe5e0b1..3e82acb9 100644
+index f255ad76..914ecfdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -997,13 +997,15 @@ struct llama_server_context
+@@ -1101,12 +1101,13 @@ struct server_context {
                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
 -            slot.add_token_string(result);
-+
+             if (slot.params.stream) {
             if (slot.params.stream)
             {
                 send_partial_response(slot, result);
             }
         }
 +        slot.add_token_string(result);
 +
-         if (incomplete)
+         if (incomplete) {
         {
             slot.has_next_token = true;
         }
--- a/llm/patches/02-cudaleaks.diff
+++ b/llm/patches/02-cudaleaks.diff
@@ -1,10 +1,10 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 8fe5e0b1..53bf39c1 100644
+index b14cca61..02bfd4b1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -31,6 +31,10 @@
+@@ -29,6 +29,10 @@
 #include <atomic>
 #include <signal.h>
 #include <memory>
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -12,8 +12,8 @@ index 8fe5e0b1..53bf39c1 100644
 +
 using json = nlohmann::json;
- struct server_params {
+ bool server_verbose = false;
-@@ -363,6 +367,10 @@ struct llama_server_context
+@@ -664,6 +668,10 @@ struct server_context {
             llama_free_model(model);
             model = nullptr;
         }
@@ -23,8 +23,8 @@ index 8fe5e0b1..53bf39c1 100644
 +#endif
     }
-     bool load_model(const gpt_params &params_)
+     bool load_model(const gpt_params & params_) {
-@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
+@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index 8fe5e0b1..53bf39c1 100644
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 72bcec8c..6c934e8c 100644
+index c207ff87..945708a4 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -43,6 +43,7 @@
+@@ -46,6 +46,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index 72bcec8c..6c934e8c 100644
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
@@ -58,7 +58,7 @@ index 72bcec8c..6c934e8c 100644
 #ifdef __HIP_PLATFORM_AMD__
         // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index 72bcec8c..6c934e8c 100644
             g_cublas_loaded = false;
             fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
             return;
-@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -76,7 +76,7 @@ index 72bcec8c..6c934e8c 100644
         g_cublas_loaded = true;
     }
 }
-@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
@@ -100,7 +100,6 @@ index 72bcec8c..6c934e8c 100644
 +
 +    g_cublas_initialized = false;
 +}
 \ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
 index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
--- a/llm/patches/03-load_exception.diff
+++ b/llm/patches/03-load_exception.diff
@@ -1,44 +0,0 @@
 diff --git a/llama.cpp b/llama.cpp
 index 4225f955..7b762f86 100644
 --- a/llama.cpp
 +++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
 -        return -1;
 +        throw;
     }
     return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 -    int status = llama_model_load(path_model, *model, params);
 -    GGML_ASSERT(status <= 0);
 -    if (status < 0) {
 -        if (status == -1) {
 -            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 -        } else if (status == -2) {
 -            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
 +    try {
 +        int status = llama_model_load(path_model, *model, params);
 +        GGML_ASSERT(status <= 0);
 +        if (status < 0) {
 +            if (status == -1) {
 +                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 +            } else if (status == -2) {
 +                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
 +            }
 +            delete model;
 +            return nullptr;
         }
 +    } catch (...) {
 +        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
         delete model;
 -        return nullptr;
 +        throw;
     }
     return model;
--- a/llm/patches/03-locale.diff
+++ b/llm/patches/03-locale.diff
@@ -1,10 +1,10 @@
 diff --git a/llama.cpp b/llama.cpp
-index b27aa272..99372f9c 100644
+index b19616e8..519b9602 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
+@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
     }
- 
+
     uint32_t to_lower(uint32_t code) {
 -        static const std::locale locale("en_US.UTF-8");
 +        static const std::locale locale("");
--- a/llm/patches/05-fix-clip-free.diff
+++ b/llm/patches/05-fix-clip-free.diff
@@ -1,45 +0,0 @@
 From 9192432daf90b1bfec75577434a99b4ea70d54c8 Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Thu, 14 Mar 2024 12:09:50 -0700
 Subject: [PATCH] fix clip free
 ---
 examples/llava/clip.cpp    | 4 ++++
 examples/server/server.cpp | 6 ++++++
 2 files changed, 10 insertions(+)
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index ef9e4ba7..b4ddfe6b 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -1673,6 +1673,10 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 +    ggml_backend_buffer_free(ctx->params_buffer);
 +    ggml_backend_buffer_free(ctx->compute_buffer);
 +    ggml_backend_free(ctx->backend);
 +    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
 index 8fe5e0b1..f927336b 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
@@ -353,6 +353,12 @@ struct llama_server_context
     ~llama_server_context()
     {
 +        if (clp_ctx)
 +        {
 +            LOG_INFO("freeing clip model", {});
 +            clip_free(clp_ctx);
 +            clp_ctx = nullptr;
 +        }
         if (ctx)
         {
             llama_free(ctx);
 -- 
 2.43.2
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -10,8 +10,8 @@ mkdir -p dist
 for TARGETARCH in arm64 amd64; do
    rm -rf llm/llama.cpp/build
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -53,7 +53,7 @@ function buildOllama() {
    write-host "Building ollama CLI"
    & go generate ./...
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & go build -trimpath -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -68,7 +68,7 @@ function buildApp() {
    write-host "Building Ollama App"
    cd "${script:SRC_DIR}\app"
    & windres -l 0 -o ollama.syso ollama.rc
-    & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -9,7 +9,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
    # Centos 7 derivatives have too old of a git version to run our generate script
    # uninstall and ignore failures
    yum remove -y git
-    yum -y install epel-release centos-release-scl
+    yum -y install epel-release centos-release-scl prelink
    yum -y install dnf
    if [ "${MACHINE}" = "x86_64" ]; then
        yum -y install https://repo.ius.io/ius-release-el7.rpm
--- a/server/routes.go
+++ b/server/routes.go
@@ -8,7 +8,6 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
 	"math"
 	"net"
 	"net/http"
 	"net/netip"
@@ -17,7 +16,6 @@ import (
 	"path/filepath"
 	"reflect"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -209,7 +207,7 @@ func GenerateHandler(c *gin.Context) {
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
+		sessionDuration = defaultSessionDuration
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -386,32 +384,6 @@ func GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
 // getDefaultSessionDuration returns the keep-alive duration to use when a
 // request does not specify one.
 //
 // The value is read from the OLLAMA_KEEP_ALIVE environment variable:
 //   - unset: defaultSessionDuration is returned;
 //   - a plain integer (e.g. "300"): interpreted as a number of seconds;
 //   - anything else: parsed with time.ParseDuration (e.g. "5m", "1h30m");
 //   - a value that cannot be parsed either way: defaultSessionDuration.
 //
 // Any negative value yields the largest representable duration
 // (math.MaxInt64 nanoseconds). A seconds count too large to be expressed in
 // nanoseconds is clamped to that same maximum instead of silently wrapping
 // around on multiplication.
 func getDefaultSessionDuration() time.Duration {
 	raw, ok := os.LookupEnv("OLLAMA_KEEP_ALIVE")
 	if !ok {
 		return defaultSessionDuration
 	}
 	const maxDuration = time.Duration(math.MaxInt64)
 	// Bare integers are treated as seconds.
 	if secs, err := strconv.Atoi(raw); err == nil {
 		// Negative means "maximum"; values past this bound would overflow
 		// int64 when multiplied by time.Second, so clamp them as well.
 		if secs < 0 || int64(secs) > math.MaxInt64/int64(time.Second) {
 			return maxDuration
 		}
 		return time.Duration(secs) * time.Second
 	}
 	// Not an integer: fall back to Go duration syntax.
 	d, err := time.ParseDuration(raw)
 	if err != nil {
 		return defaultSessionDuration
 	}
 	if d < 0 {
 		return maxDuration
 	}
 	return d
 }
 func EmbeddingsHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -455,7 +427,7 @@ func EmbeddingsHandler(c *gin.Context) {
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
+		sessionDuration = defaultSessionDuration
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -1256,7 +1228,7 @@ func ChatHandler(c *gin.Context) {
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
+		sessionDuration = defaultSessionDuration
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}