Compare commits
26 Commits
brucemacd/
...
mattw/what
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69a1ce5527 | ||
|
|
b5939008a1 | ||
|
|
e9ce91e9a6 | ||
|
|
4ad6c9b11f | ||
|
|
c0285158a9 | ||
|
|
77a66df72c | ||
|
|
5b4837f881 | ||
|
|
29340c2e62 | ||
|
|
d5ec730354 | ||
|
|
8bed487aba | ||
|
|
c1a10a6e9b | ||
|
|
ddbfa6fe31 | ||
|
|
2fcd41ef81 | ||
|
|
16f4603b67 | ||
|
|
1184686649 | ||
|
|
2588cb2daa | ||
|
|
c7ea8f237e | ||
|
|
0b3118e0af | ||
|
|
05face44ef | ||
|
|
a2ad952440 | ||
|
|
5fea4410be | ||
|
|
b846eb64d0 | ||
|
|
3c5dd9ed1d | ||
|
|
b17ccd0542 | ||
|
|
d0409f772f | ||
|
|
ec261422af |
28
cmd/cmd.go
28
cmd/cmd.go
@@ -662,10 +662,11 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
|
||||
usage := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set Set session variables")
|
||||
fmt.Fprintln(os.Stderr, " /show Show model information")
|
||||
fmt.Fprintln(os.Stderr, " /bye Exit")
|
||||
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
|
||||
fmt.Fprintln(os.Stderr, " /set Set session variables")
|
||||
fmt.Fprintln(os.Stderr, " /show Show model information")
|
||||
fmt.Fprintln(os.Stderr, " /bye Exit")
|
||||
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
|
||||
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
@@ -687,6 +688,21 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShortcuts := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
|
||||
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
|
||||
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShow := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /show license Show model license")
|
||||
@@ -737,7 +753,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
return nil
|
||||
case errors.Is(err, readline.ErrInterrupt):
|
||||
if line == "" {
|
||||
fmt.Println("\nUse Ctrl-D or /bye to exit.")
|
||||
fmt.Println("\nUse Ctrl + d or /bye to exit.")
|
||||
}
|
||||
|
||||
scanner.Prompt.UseAlt = false
|
||||
@@ -940,6 +956,8 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
usageSet()
|
||||
case "show", "/show":
|
||||
usageShow()
|
||||
case "shortcut", "shortcuts":
|
||||
usageShortcuts()
|
||||
}
|
||||
} else {
|
||||
usage()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Documentation
|
||||
|
||||
To get started, see the project's **[quicktart](../README.md#quickstart)**.
|
||||
To get started, see the project's **[quickstart](../README.md#quickstart)**.
|
||||
|
||||
Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
|
||||
|
||||
@@ -22,4 +22,4 @@ Finally for all the questions that don't fit anywhere else, there is the **[FAQ]
|
||||
|
||||
[Tutorials](./tutorials.md) apply the documentation to tasks.
|
||||
|
||||
For working code examples of using Ollama, see [Examples](../examples).
|
||||
For working code examples of using Ollama, see [Examples](../examples).
|
||||
|
||||
@@ -46,7 +46,7 @@ Advanced parameters (optional):
|
||||
- `format`: the format to return a response in. Currently the only accepted value is `json`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `system`: system message to (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
|
||||
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
|
||||
@@ -377,7 +377,7 @@ Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Currently the only accepted value is `json`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
|
||||
### Examples
|
||||
|
||||
@@ -8,6 +8,12 @@ To upgrade Ollama, run the installation process again. On the Mac, click the Oll
|
||||
|
||||
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
|
||||
|
||||
## What are the components of Ollama that need to be running to work with the CLI, the API, and 3rd party tools?
|
||||
|
||||
At the heart of Ollama there are two main components: the server and the client. Even if everything is running on a single machine, there is a server that is running as a service, or background process, and there is some sort of client. Often that is the CLI. For instance, `ollama run llama2` is the command to start the CLI. You will often see this referred to as the REPL, a tool where you can interactively work with Ollama. You can run the server using the command `ollama serve`, but we recommend letting the service run instead. The Ollama installer script for Linux will add a systemd service to your machine that runs `ollama serve` as the user, `ollama`. On macOS, running `ollama` will start the Ollama Menu Bar app which is running the service.
|
||||
|
||||
The Ollama service is what actually loads the model and processes your requests. It also serves the API that all clients use, including our REPL and any 3rd party tools. There are some tools that require adding some environment variables to make the service more accessible in different ways. You can learn more about configuring those below.
|
||||
|
||||
## How do I use Ollama server environment variables on Mac
|
||||
|
||||
On macOS, Ollama runs in the background and is managed by the menubar app. If adding environment variables, Ollama will need to be run manually.
|
||||
|
||||
@@ -148,6 +148,7 @@ The quantization options are as follow (from highest highest to lowest levels of
|
||||
- `q5_K_M`
|
||||
- `q6_K`
|
||||
- `q8_0`
|
||||
- `f16`
|
||||
|
||||
## Manually converting & quantizing models
|
||||
|
||||
|
||||
15
gpu/gpu.go
15
gpu/gpu.go
@@ -66,7 +66,7 @@ func GetGPUInfo() GpuInfo {
|
||||
}
|
||||
|
||||
var memInfo C.mem_info_t
|
||||
resp := GpuInfo{"", 0, 0}
|
||||
resp := GpuInfo{}
|
||||
if gpuHandles.cuda != nil {
|
||||
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
@@ -103,6 +103,19 @@ func GetGPUInfo() GpuInfo {
|
||||
return resp
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
var ret memInfo
|
||||
var info C.mem_info_t
|
||||
C.cpu_check_ram(&info)
|
||||
if info.err != nil {
|
||||
defer C.free(unsafe.Pointer(info.err))
|
||||
return ret, fmt.Errorf(C.GoString(info.err))
|
||||
}
|
||||
ret.FreeMemory = uint64(info.free)
|
||||
ret.TotalMemory = uint64(info.total)
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func CheckVRAM() (int64, error) {
|
||||
gpuInfo := GetGPUInfo()
|
||||
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
||||
|
||||
@@ -18,20 +18,30 @@ func CheckVRAM() (int64, error) {
|
||||
|
||||
func GetGPUInfo() GpuInfo {
|
||||
// TODO - Metal vs. x86 macs...
|
||||
|
||||
mem, _ := getCPUMem()
|
||||
return GpuInfo{
|
||||
Library: "default",
|
||||
TotalMemory: 0,
|
||||
FreeMemory: 0,
|
||||
Library: "default",
|
||||
memInfo: mem,
|
||||
}
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
return memInfo{
|
||||
TotalMemory: 0,
|
||||
FreeMemory: 0,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
|
||||
// metal only supported on arm64
|
||||
if runtime.GOARCH == "arm64" {
|
||||
return 1
|
||||
}
|
||||
|
||||
// metal only supported on arm64
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
@@ -9,20 +9,21 @@
|
||||
#include <dlfcn.h>
|
||||
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
|
||||
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
|
||||
#define LOAD_ERR() dlerror()
|
||||
#define LOAD_ERR() strdup(dlerror())
|
||||
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
||||
#else
|
||||
#include <windows.h>
|
||||
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
|
||||
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
|
||||
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
|
||||
|
||||
// TODO - refactor this with proper error message handling on windows
|
||||
inline static char *LOAD_ERR() {
|
||||
static char errbuf[8];
|
||||
snprintf(errbuf, 8, "0x%lx", GetLastError());
|
||||
return errbuf;
|
||||
}
|
||||
#define LOAD_ERR() ({\
|
||||
LPSTR messageBuffer = NULL; \
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
|
||||
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
|
||||
char *resp = strdup(messageBuffer); \
|
||||
LocalFree(messageBuffer); \
|
||||
resp; \
|
||||
})
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,11 +6,12 @@
|
||||
void cpu_check_ram(mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
MEMORYSTATUSEX info;
|
||||
info.dwLength = sizeof(info);
|
||||
if (GlobalMemoryStatusEx(&info) != 0) {
|
||||
resp->total = info.ullTotalPhys;
|
||||
resp->free = info.ullAvailPhys;
|
||||
} else {
|
||||
resp->err = strdup(LOAD_ERR());
|
||||
resp->err = LOAD_ERR();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -43,9 +43,11 @@ void cuda_init(cuda_init_resp_t *resp) {
|
||||
if (!resp->ch.handle) {
|
||||
// TODO improve error message, as the LOAD_ERR will have typically have the
|
||||
// final path that was checked which might be confusing.
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||
cuda_lib_paths[0], LOAD_ERR());
|
||||
cuda_lib_paths[0], msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
@@ -55,8 +57,10 @@ void cuda_init(cuda_init_resp_t *resp) {
|
||||
if (!l[i].p) {
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
LOAD_ERR());
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -40,9 +40,11 @@ void rocm_init(rocm_init_resp_t *resp) {
|
||||
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
|
||||
}
|
||||
if (!resp->rh.handle) {
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Radeon GPUs: %s\n",
|
||||
rocm_lib_paths[0], LOAD_ERR());
|
||||
rocm_lib_paths[0], msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
@@ -51,8 +53,10 @@ void rocm_init(rocm_init_resp_t *resp) {
|
||||
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
UNLOAD_LIBRARY(resp->rh.handle);
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
LOAD_ERR());
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -23,4 +23,19 @@ func TestBasicGetGPUInfo(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPUMemInfo(t *testing.T) {
|
||||
info, err := getCPUMem()
|
||||
assert.NoError(t, err)
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
t.Skip("CPU memory not populated on darwin")
|
||||
case "linux", "windows":
|
||||
assert.Greater(t, info.TotalMemory, uint64(0))
|
||||
assert.Greater(t, info.FreeMemory, uint64(0))
|
||||
default:
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
|
||||
|
||||
10
gpu/types.go
10
gpu/types.go
@@ -1,10 +1,14 @@
|
||||
package gpu
|
||||
|
||||
type memInfo struct {
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
}
|
||||
|
||||
// Beginning of an `ollama info` command
|
||||
type GpuInfo struct {
|
||||
Library string `json:"library,omitempty"`
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
memInfo
|
||||
Library string `json:"library,omitempty"`
|
||||
|
||||
// TODO add other useful attributes about the card here for discovery information
|
||||
}
|
||||
|
||||
@@ -153,7 +153,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
|
||||
return server, nil
|
||||
}
|
||||
|
||||
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
||||
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var imageData []ImageData
|
||||
@@ -167,23 +167,23 @@ func predict(llm extServer, opts api.Options, ctx context.Context, predict Predi
|
||||
request := map[string]any{
|
||||
"prompt": predict.Prompt,
|
||||
"stream": true,
|
||||
"n_predict": opts.NumPredict,
|
||||
"n_keep": opts.NumKeep,
|
||||
"temperature": opts.Temperature,
|
||||
"top_k": opts.TopK,
|
||||
"top_p": opts.TopP,
|
||||
"tfs_z": opts.TFSZ,
|
||||
"typical_p": opts.TypicalP,
|
||||
"repeat_last_n": opts.RepeatLastN,
|
||||
"repeat_penalty": opts.RepeatPenalty,
|
||||
"presence_penalty": opts.PresencePenalty,
|
||||
"frequency_penalty": opts.FrequencyPenalty,
|
||||
"mirostat": opts.Mirostat,
|
||||
"mirostat_tau": opts.MirostatTau,
|
||||
"mirostat_eta": opts.MirostatEta,
|
||||
"penalize_nl": opts.PenalizeNewline,
|
||||
"seed": opts.Seed,
|
||||
"stop": opts.Stop,
|
||||
"n_predict": predict.Options.NumPredict,
|
||||
"n_keep": predict.Options.NumKeep,
|
||||
"temperature": predict.Options.Temperature,
|
||||
"top_k": predict.Options.TopK,
|
||||
"top_p": predict.Options.TopP,
|
||||
"tfs_z": predict.Options.TFSZ,
|
||||
"typical_p": predict.Options.TypicalP,
|
||||
"repeat_last_n": predict.Options.RepeatLastN,
|
||||
"repeat_penalty": predict.Options.RepeatPenalty,
|
||||
"presence_penalty": predict.Options.PresencePenalty,
|
||||
"frequency_penalty": predict.Options.FrequencyPenalty,
|
||||
"mirostat": predict.Options.Mirostat,
|
||||
"mirostat_tau": predict.Options.MirostatTau,
|
||||
"mirostat_eta": predict.Options.MirostatEta,
|
||||
"penalize_nl": predict.Options.PenalizeNewline,
|
||||
"seed": predict.Options.Seed,
|
||||
"stop": predict.Options.Stop,
|
||||
"image_data": imageData,
|
||||
"cache_prompt": true,
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ func newDefaultExtServer(model string, adapters, projectors []string, numLayers
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(llm, llm.Options, ctx, pred, fn)
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
@@ -10,6 +8,5 @@ func newDefaultExtServer(model string, adapters, projectors []string, numLayers
|
||||
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
|
||||
// This ensures we can update the PATH at runtime to get everything loaded
|
||||
|
||||
// Should not happen
|
||||
return nil, fmt.Errorf("no default impl on windows - all dynamic")
|
||||
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, numLayers, opts)
|
||||
}
|
||||
|
||||
@@ -3,14 +3,13 @@
|
||||
init_vars() {
|
||||
LLAMACPP_DIR=gguf
|
||||
PATCHES="0001-Expose-callable-API-for-server.patch"
|
||||
CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
|
||||
# TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
|
||||
CMAKE_DEFS=""
|
||||
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
|
||||
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
|
||||
else
|
||||
# TODO - add additional optimization flags...
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -9,11 +9,11 @@ set -o pipefail
|
||||
echo "Starting darwin generate script"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="gguf/build/darwin/metal"
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
;;
|
||||
"arm64")
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
|
||||
|
||||
@@ -16,11 +16,33 @@
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
|
||||
amdGPUs() {
|
||||
GPU_LIST=(
|
||||
"gfx803"
|
||||
"gfx900"
|
||||
"gfx906:xnack-"
|
||||
"gfx908:xnack-"
|
||||
"gfx90a:xnack+"
|
||||
"gfx90a:xnack-"
|
||||
"gfx1010"
|
||||
"gfx1012"
|
||||
"gfx1030"
|
||||
"gfx1100"
|
||||
"gfx1101"
|
||||
"gfx1102"
|
||||
)
|
||||
(
|
||||
IFS=$';'
|
||||
echo "'${GPU_LIST[*]}'"
|
||||
)
|
||||
}
|
||||
|
||||
echo "Starting linux generate script"
|
||||
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
|
||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||
fi
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
git_module_setup
|
||||
@@ -35,6 +57,9 @@ BUILD_DIR="gguf/build/linux/cpu"
|
||||
build
|
||||
install
|
||||
|
||||
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
|
||||
touch ${BUILD_DIR}/lib/dummy.so
|
||||
|
||||
if [ -d /usr/local/cuda/lib64/ ]; then
|
||||
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||
init_vars
|
||||
@@ -72,7 +97,7 @@ fi
|
||||
if [ -d "${ROCM_PATH}" ]; then
|
||||
echo "ROCm libraries detected - building dynamic ROCm library"
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
BUILD_DIR="gguf/build/linux/rocm"
|
||||
build
|
||||
install
|
||||
|
||||
@@ -4,7 +4,7 @@ $ErrorActionPreference = "Stop"
|
||||
|
||||
function init_vars {
|
||||
$script:patches = @("0001-Expose-callable-API-for-server.patch")
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-A","x64")
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
|
||||
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
|
||||
@@ -49,6 +49,9 @@ function install {
|
||||
md "${script:buildDir}/lib" -ea 0 > $null
|
||||
cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
|
||||
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
|
||||
|
||||
# Display the dll dependencies in the build log
|
||||
dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
|
||||
}
|
||||
|
||||
function cleanup {
|
||||
|
||||
@@ -166,9 +166,10 @@ const maxRetries = 3
|
||||
const retryDelay = 1 * time.Second
|
||||
|
||||
type PredictOpts struct {
|
||||
Prompt string
|
||||
Format string
|
||||
Images []api.ImageData
|
||||
Prompt string
|
||||
Format string
|
||||
Images []api.ImageData
|
||||
Options api.Options
|
||||
}
|
||||
|
||||
type PredictResult struct {
|
||||
|
||||
19
llm/llm.go
19
llm/llm.go
@@ -41,16 +41,6 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
}
|
||||
|
||||
if runtime.GOOS == "darwin" {
|
||||
switch ggml.FileType() {
|
||||
case "F32", "Q5_0", "Q5_1", "Q8_0":
|
||||
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
|
||||
// GGML Q8_0 do not support Metal API and will
|
||||
// cause the runner to segmentation fault so disable GPU
|
||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||
opts.NumGPU = 0
|
||||
}
|
||||
}
|
||||
|
||||
var requiredMemory int64
|
||||
var f16Multiplier int64 = 2
|
||||
|
||||
@@ -61,6 +51,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
requiredMemory = 16 * format.GigaByte
|
||||
case "30B", "34B", "40B":
|
||||
requiredMemory = 32 * format.GigaByte
|
||||
case "47B":
|
||||
requiredMemory = 48 * format.GigaByte
|
||||
case "65B", "70B":
|
||||
requiredMemory = 64 * format.GigaByte
|
||||
case "180B":
|
||||
@@ -71,9 +63,9 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
systemMemory := int64(memory.TotalMemory())
|
||||
|
||||
if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
|
||||
return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))
|
||||
return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
|
||||
} else if requiredMemory > systemMemory {
|
||||
return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
|
||||
return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +87,8 @@ func newLlmServer(library, model string, adapters, projectors []string, numLayer
|
||||
if err == nil {
|
||||
return srv, nil
|
||||
}
|
||||
log.Printf("Failed to load dynamic library - falling back to CPU mode %s", err)
|
||||
log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
|
||||
// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
|
||||
}
|
||||
|
||||
return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
|
||||
|
||||
@@ -92,7 +92,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(llm, llm.options, ctx, pred, fn)
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
@@ -147,9 +147,9 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, payloadMissing
|
||||
}
|
||||
libs := make([]string, len(files))
|
||||
libs := []string{}
|
||||
|
||||
for i, file := range files {
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != 7 {
|
||||
log.Printf("unexpected payload components: %v", pathComps)
|
||||
@@ -169,7 +169,7 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
|
||||
destFile := filepath.Join(targetDir, filepath.Base(file))
|
||||
if strings.Contains(destFile, "server") {
|
||||
libs[i] = destFile
|
||||
libs = append(libs, destFile)
|
||||
}
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"embed"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
@@ -11,14 +12,20 @@ import (
|
||||
var libEmbed embed.FS
|
||||
|
||||
func updatePath(dir string) {
|
||||
tmpDir := filepath.Dir(dir)
|
||||
pathComponents := strings.Split(os.Getenv("PATH"), ";")
|
||||
i := 0
|
||||
for _, comp := range pathComponents {
|
||||
// Case incensitive
|
||||
if strings.ToLower(comp) == strings.ToLower(dir) {
|
||||
if strings.EqualFold(comp, dir) {
|
||||
return
|
||||
}
|
||||
// Remove any other prior paths to our temp dir
|
||||
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
|
||||
pathComponents[i] = comp
|
||||
i++
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append(pathComponents, dir), ";")
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
|
||||
log.Printf("Updating PATH to %s", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
}
|
||||
|
||||
@@ -5,13 +5,11 @@ set -eu
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
docker buildx build \
|
||||
docker build \
|
||||
--load \
|
||||
--platform=linux/arm64,linux/amd64 \
|
||||
--build-arg=VERSION \
|
||||
--build-arg=GOFLAGS \
|
||||
--cache-from type=local,src=.cache \
|
||||
--cache-to type=local,dest=.cache \
|
||||
-f Dockerfile \
|
||||
-t ollama/ollama:$VERSION \
|
||||
.
|
||||
|
||||
@@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in amd64 arm64; do
|
||||
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --cache-from type=local,src=.cache --cache-to type=local,dest=.cache -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker build --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
|
||||
docker rm builder-$TARGETARCH
|
||||
|
||||
@@ -33,6 +33,14 @@ case "$ARCH" in
|
||||
*) error "Unsupported architecture: $ARCH" ;;
|
||||
esac
|
||||
|
||||
KERN=$(uname -r)
|
||||
case "$KERN" in
|
||||
*icrosoft*WSL2 | *icrosoft*wsl2) ;;
|
||||
*icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
|
||||
*) ;;
|
||||
esac
|
||||
|
||||
|
||||
SUDO=
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
# Running as root, no need for sudo
|
||||
@@ -76,6 +84,10 @@ configure_systemd() {
|
||||
status "Creating ollama user..."
|
||||
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
|
||||
fi
|
||||
if getent group render >/dev/null 2>&1; then
|
||||
status "Adding ollama user to render group..."
|
||||
$SUDO usermod -a -G render ollama
|
||||
fi
|
||||
|
||||
status "Adding current user to ollama group..."
|
||||
$SUDO usermod -a -G ollama $(whoami)
|
||||
|
||||
@@ -5,12 +5,11 @@ set -eu
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
docker buildx build \
|
||||
docker build \
|
||||
--push \
|
||||
--platform=linux/arm64,linux/amd64 \
|
||||
--build-arg=VERSION \
|
||||
--build-arg=GOFLAGS \
|
||||
--cache-from type=local,src=.cache \
|
||||
-f Dockerfile \
|
||||
-t ollama/ollama -t ollama/ollama:$VERSION \
|
||||
.
|
||||
|
||||
@@ -485,9 +485,15 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := PullModel(ctx, parent.OriginalModel, &RegistryOptions{}, fn); err != nil {
|
||||
log.Printf("error pulling model: %v", err)
|
||||
|
||||
originalModel := parent.OriginalModel
|
||||
if originalModel == "" {
|
||||
originalModel = parent.ShortName
|
||||
}
|
||||
if err := PullModel(ctx, originalModel, &RegistryOptions{}, fn); err != nil {
|
||||
log.Printf("error pulling parent model: %v", err)
|
||||
}
|
||||
|
||||
// Reset the file pointer to the beginning of the file
|
||||
_, err = fromConfigFile.Seek(0, 0)
|
||||
if err != nil {
|
||||
|
||||
130
server/routes.go
130
server/routes.go
@@ -64,24 +64,9 @@ var loaded struct {
|
||||
var defaultSessionDuration = 5 * time.Minute
|
||||
|
||||
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
|
||||
func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sessionDuration time.Duration) (*Model, error) {
|
||||
model, err := GetModel(modelName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
|
||||
workDir := c.GetString("workDir")
|
||||
|
||||
opts := api.DefaultOptions()
|
||||
if err := opts.FromMap(model.Options); err != nil {
|
||||
log.Printf("could not load model options: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := opts.FromMap(reqOpts); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
needLoad := loaded.runner == nil || // is there a model loaded?
|
||||
loaded.ModelPath != model.ModelPath || // has the base model changed?
|
||||
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
|
||||
@@ -105,7 +90,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
|
||||
}
|
||||
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
loaded.Model = model
|
||||
@@ -135,7 +120,20 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
|
||||
}
|
||||
|
||||
loaded.expireTimer.Reset(sessionDuration)
|
||||
return model, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
|
||||
opts := api.DefaultOptions()
|
||||
if err := opts.FromMap(model.Options); err != nil {
|
||||
return api.Options{}, err
|
||||
}
|
||||
|
||||
if err := opts.FromMap(requestOpts); err != nil {
|
||||
return api.Options{}, err
|
||||
}
|
||||
|
||||
return opts, nil
|
||||
}
|
||||
|
||||
func GenerateHandler(c *gin.Context) {
|
||||
@@ -168,18 +166,30 @@ func GenerateHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
sessionDuration := defaultSessionDuration
|
||||
model, err := load(c, req.Model, req.Options, sessionDuration)
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
var pErr *fs.PathError
|
||||
switch {
|
||||
case errors.As(err, &pErr):
|
||||
if errors.As(err, &pErr) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
|
||||
case errors.Is(err, api.ErrInvalidOpts):
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
opts, err := modelOptions(model, req.Options)
|
||||
if err != nil {
|
||||
if errors.Is(err, api.ErrInvalidOpts) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
sessionDuration := defaultSessionDuration
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -287,9 +297,10 @@ func GenerateHandler(c *gin.Context) {
|
||||
|
||||
// Start prediction
|
||||
predictReq := llm.PredictOpts{
|
||||
Prompt: prompt,
|
||||
Format: req.Format,
|
||||
Images: req.Images,
|
||||
Prompt: prompt,
|
||||
Format: req.Format,
|
||||
Images: req.Images,
|
||||
Options: opts,
|
||||
}
|
||||
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
@@ -347,18 +358,29 @@ func EmbeddingHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
sessionDuration := defaultSessionDuration
|
||||
_, err = load(c, req.Model, req.Options, sessionDuration)
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
var pErr *fs.PathError
|
||||
switch {
|
||||
case errors.As(err, &pErr):
|
||||
if errors.As(err, &pErr) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
|
||||
case errors.Is(err, api.ErrInvalidOpts):
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
opts, err := modelOptions(model, req.Options)
|
||||
if err != nil {
|
||||
if errors.Is(err, api.ErrInvalidOpts) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
sessionDuration := defaultSessionDuration
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -991,18 +1013,29 @@ func ChatHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
sessionDuration := defaultSessionDuration
|
||||
model, err := load(c, req.Model, req.Options, sessionDuration)
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
var pErr *fs.PathError
|
||||
switch {
|
||||
case errors.As(err, &pErr):
|
||||
if errors.As(err, &pErr) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
|
||||
case errors.Is(err, api.ErrInvalidOpts):
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
opts, err := modelOptions(model, req.Options)
|
||||
if err != nil {
|
||||
if errors.Is(err, api.ErrInvalidOpts) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
sessionDuration := defaultSessionDuration
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1053,9 +1086,10 @@ func ChatHandler(c *gin.Context) {
|
||||
|
||||
// Start prediction
|
||||
predictReq := llm.PredictOpts{
|
||||
Prompt: prompt,
|
||||
Format: req.Format,
|
||||
Images: images,
|
||||
Prompt: prompt,
|
||||
Format: req.Format,
|
||||
Images: images,
|
||||
Options: opts,
|
||||
}
|
||||
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
|
||||
Reference in New Issue
Block a user