Compare commits


15 Commits

Author SHA1 Message Date
Jeffrey Morgan
2a7553ce09 update llama.cpp submodule to c14f72d 2024-02-21 09:03:14 -05:00
Sun Bo
10af6070a9 Update big-AGI config file link (#2626)
Co-authored-by: bo.sun <bo.sun@cotticoffee.com>
2024-02-21 01:24:48 -05:00
Jeffrey Morgan
92423b0600 add dist directory in build_windows.ps 2024-02-21 00:05:05 -05:00
Jeffrey Morgan
b3eac61cac update llama.cpp submodule to f0d1fafc029a056cd765bdae58dcaa12312e9879 2024-02-20 22:56:51 -05:00
Jeffrey Morgan
287ba11500 better error message when calling /api/generate or /api/chat with embedding models 2024-02-20 21:53:45 -05:00
Jeffrey Morgan
63861f58cc Support for bert and nomic-bert embedding models 2024-02-20 21:37:29 -05:00
Jeffrey Morgan
f0425d3de9 Update faq.md 2024-02-20 20:44:45 -05:00
Michael Yang
210b65268e replace strings buffer with hasher (#2437)
the buffered value is going into the hasher eventually so write directly
to the hasher instead
2024-02-20 19:07:50 -05:00
Michael Yang
949d7b1c48 add gguf file types (#2532) 2024-02-20 19:06:29 -05:00
Michael Yang
897b213468 use http.DefaultClient (#2530)
default client already handles proxy
2024-02-20 18:34:47 -05:00
Jeffrey Morgan
4613a080e7 update llama.cpp submodule to 66c1968f7 (#2618) 2024-02-20 17:42:31 -05:00
Muhammed Nazeem
ace2cdf1c6 Add Page Assist to the community integrations (#2447) 2024-02-20 14:03:58 -05:00
Nikesh Parajuli
eed92bc19a docs: add Msty app in readme (#1775)
* docs: add Msty app in readme

* docs: update msty url
2024-02-20 14:03:33 -05:00
Michael Edoror
e0a2f46466 Update README.md to include Elixir LangChain Library (#2180)
The Elixir LangChain Library now supports Ollama Chat with this [PR](https://github.com/brainlid/langchain/pull/70)
2024-02-20 14:03:02 -05:00
Taras Tsugrii
01ff2e14db [nit] Remove unused msg local var. (#2511) 2024-02-20 14:02:34 -05:00
16 changed files with 95 additions and 211 deletions

View File

@@ -264,13 +264,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
-- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
+- [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
 - [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
 - [MindMac](https://mindmac.app)
 - [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
+- [Msty](https://msty.app)
 ### Terminal
@@ -315,6 +316,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChainDart](https://github.com/davidmigloz/langchain_dart)
 - [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
 - [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
+- [Elixir LangChain](https://github.com/brainlid/langchain)
 - [Ollama for R - rollama](https://github.com/JBGruber/rollama)
 - [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
@@ -340,3 +342,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
+- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)

View File

@@ -21,7 +21,7 @@ import (
 type Client struct {
 	base *url.URL
-	http http.Client
+	http *http.Client
 }
 func checkError(resp *http.Response, body []byte) error {
@@ -66,30 +66,13 @@ func ClientFromEnvironment() (*Client, error) {
 		}
 	}
-	client := Client{
+	return &Client{
 		base: &url.URL{
 			Scheme: scheme,
 			Host:   net.JoinHostPort(host, port),
 		},
-	}
-	mockRequest, err := http.NewRequest(http.MethodHead, client.base.String(), nil)
-	if err != nil {
-		return nil, err
-	}
-	proxyURL, err := http.ProxyFromEnvironment(mockRequest)
-	if err != nil {
-		return nil, err
-	}
-	client.http = http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-	return &client, nil
+		http: http.DefaultClient,
+	}, nil
 }
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
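A note on why the hand-rolled transport could simply be deleted: `http.DefaultClient` carries a nil Transport, which falls back to `http.DefaultTransport`, and that transport is already configured with `http.ProxyFromEnvironment`, exactly as the commit message says. A minimal sketch (not part of the diff) that performs the same proxy resolution the removed code did by hand:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// http.DefaultTransport ships as an *http.Transport whose Proxy field
	// is http.ProxyFromEnvironment, so HTTP_PROXY/HTTPS_PROXY/NO_PROXY are
	// honored without any manual wiring.
	t := http.DefaultTransport.(*http.Transport)

	req, err := http.NewRequest(http.MethodHead, "http://example.com", nil)
	if err != nil {
		panic(err)
	}

	// Same resolution the deleted ClientFromEnvironment code performed with
	// a mock request; prints the proxy URL taken from the environment, or
	// <nil> when none is configured.
	proxyURL, err := t.Proxy(req)
	fmt.Println(proxyURL, err)
}
```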

View File

@@ -34,20 +34,6 @@ type UpdateResponse struct {
 	UpdateVersion string `json:"version"`
 }
-func getClient(req *http.Request) http.Client {
-	proxyURL, err := http.ProxyFromEnvironment(req)
-	if err != nil {
-		slog.Warn(fmt.Sprintf("failed to handle proxy: %s", err))
-		return http.Client{}
-	}
-	return http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-}
 func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
 	var updateResp UpdateResponse
@@ -83,10 +69,9 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
 	}
 	req.Header.Set("Authorization", signature)
 	req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
-	client := getClient(req)
 	slog.Debug("checking for available update", "requestURL", requestURL)
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to check for update: %s", err))
 		return false, updateResp
@@ -119,8 +104,8 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
 	if err != nil {
 		return err
 	}
-	client := getClient(req)
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return fmt.Errorf("error checking update: %w", err)
 	}
@@ -151,7 +136,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
 	cleanupOldDownloads()
 	req.Method = http.MethodGet
-	resp, err = client.Do(req)
+	resp, err = http.DefaultClient.Do(req)
 	if err != nil {
 		return fmt.Errorf("error checking update: %w", err)
 	}

View File

@@ -113,9 +113,9 @@ If a different directory needs to be used, set the environment variable `OLLAMA_
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
-## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
+## Does Ollama send my prompts and answers back to ollama.com?
-No, Ollama runs entirely locally, and conversation data will never leave your machine.
+No. Ollama runs locally, and conversation data does not leave your machine.
 ## How can I use Ollama in Visual Studio Code?
View File

@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
-	sparams.numa = C.bool(opts.UseNUMA)
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}
 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
@@ -194,7 +199,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		request["grammar"] = jsonGrammar
 	}
-	var whitespace int
 	retryDelay := 100 * time.Microsecond
 	for retries := 0; retries < maxRetries; retries++ {
 		if retries > 0 {
@@ -253,24 +257,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 				break out
 			}
-			// detect if p.Content is entirely whitespace
-			if predict.Format == "json" && strings.TrimSpace(p.Content) == "" {
-				whitespace++
-				// if we get 100 consecutive whitespace responses, cancel
-				if whitespace > 100 {
-					slog.Debug("cancelling due to excessive whitespace")
-					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
-					if resp.id < 0 {
-						return extServerResponseToErr(resp)
-					}
-					return nil
-				}
-			} else {
-				whitespace = 0
-			}
 			if p.Content != "" {
 				fn(PredictResult{
 					Content: p.Content,
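The `bool` to `C.int` change tracks an upstream llama.cpp API change: `llama_backend_init(bool numa)` was split into `llama_backend_init()` plus `llama_numa_init(strategy)`, where the strategy is a `ggml_numa_strategy` enum rather than an on/off flag (see the ext_server diffs below). A hedged sketch of the mapping follows; the enum values are assumed from the ggml header at this submodule revision, so verify them before relying on this:

```go
package main

import "fmt"

// Assumed ggml_numa_strategy values (check ggml.h at the pinned revision).
const (
	numaDisabled   = 0 // GGML_NUMA_STRATEGY_DISABLED
	numaDistribute = 1 // GGML_NUMA_STRATEGY_DISTRIBUTE
	numaIsolate    = 2 // GGML_NUMA_STRATEGY_ISOLATE
	numaNumactl    = 3 // GGML_NUMA_STRATEGY_NUMACTL
	numaMirror     = 4 // GGML_NUMA_STRATEGY_MIRROR
)

// numaStrategy mirrors the new branch in newDynExtServer: Ollama's boolean
// UseNUMA option collapses onto "disabled" or "distribute".
func numaStrategy(useNUMA bool) int {
	if useNUMA {
		return numaDistribute
	}
	return numaDisabled
}

func main() {
	fmt.Println(numaStrategy(true), numaStrategy(false)) // 1 0
}
```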

View File

@@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   params.main_gpu = sparams->main_gpu;
   params.use_mlock = sparams->use_mlock;
   params.use_mmap = sparams->use_mmap;
-  params.numa = sparams->numa;
+  params.numa = (ggml_numa_strategy)sparams->numa;
   params.embedding = sparams->embedding;
   if (sparams->model != NULL) {
     params.model = sparams->model;
@@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   }
 #endif
-  llama_backend_init(params.numa);
+  llama_backend_init();
+  llama_numa_init(params.numa);
   // load the model
   if (!llama->load_model(params)) {
@@ -208,7 +209,6 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
 void llama_server_completion_next_result(const int task_id,
                                          ext_server_task_result_t *resp) {
   assert(llama != NULL && resp != NULL);
-  std::string msg;
   resp->id = -1;
   resp->stop = false;
   resp->error = false;

View File

@@ -41,7 +41,7 @@ typedef struct ext_server_params {
   int32_t main_gpu;  // the GPU that is used for scratch and small tensors
   bool use_mlock;    // force system to keep model in RAM
   bool use_mmap;     // use mmap if possible
-  bool numa;         // attempt optimizations that help on some NUMA systems
+  int numa;          // attempt optimizations that help on some NUMA systems
   bool embedding;    // get only sentence embedding
   ext_server_lora_adapter_t *lora_adapters;
   char *mmproj;

View File

@@ -31,6 +31,11 @@ const (
 	fileTypeQ5_K_S
 	fileTypeQ5_K_M
 	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeQ3_K_XS
+	fileTypeIQ3_XXS
 )
 func fileType(fileType uint32) string {
@@ -69,6 +74,16 @@ func fileType(fileType uint32) string {
 	case fileTypeQ5_K_M:
 		return "Q5_K_M"
 	case fileTypeQ6_K:
 		return "Q6_K"
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
+		return "Q2_K_S"
+	case fileTypeQ3_K_XS:
+		return "Q3_K_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
 	default:
 		return "unknown"
 	}
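For context, `fileType` decodes the GGUF `general.file_type` value into a human-readable quantization name. A hedged sketch of just the new cases as a standalone lookup; the numeric values assume the const block above continues llama.cpp's `LLAMA_FTYPE_*` numbering (`IQ2_XXS` = 19 through `IQ3_XXS` = 23), which should be checked against the const block before being relied on:

```go
package main

import "fmt"

// Assumed numeric values for the new entries, following llama.cpp's
// LLAMA_FTYPE_* numbering at this revision (an assumption, not a contract).
var newFileTypes = map[uint32]string{
	19: "IQ2_XXS",
	20: "IQ2_XS",
	21: "Q2_K_S",
	22: "Q3_K_XS",
	23: "IQ3_XXS",
}

func main() {
	for v := uint32(19); v <= 23; v++ {
		fmt.Println(v, newFileTypes[v])
	}
}
```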

View File

@@ -115,6 +115,14 @@ func (t tensor) typeSize() uint64 {
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
 	case 14: // Q6_K
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
+	case 15: // Q8_K
+		return 2 + blockSize + 2*blockSize/16
+	case 16: // IQ2_XXS
+		return 2 + 2*blockSize/8
+	case 17: // IQ2_XS
+		return 2 + 2*blockSize/8 + blockSize/32
+	case 18: // IQ3_XXS
+		return 2 + 3*blockSize/8
 	default:
 		return 0
 	}
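These sizes can be sanity-checked by hand. With the 256-weight super-block that the K- and IQ-quants use, the formulas above give the per-block byte counts below, and `IQ2_XXS` works out to 2.0625 bits per weight, matching how upstream llama.cpp advertises that format. A small worked example:

```go
package main

import "fmt"

func main() {
	// K-quants and the new IQ formats use 256-weight super-blocks.
	const blockSize = 256

	// Per-block byte counts, exactly as computed by typeSize() above.
	sizes := []struct {
		name  string
		bytes uint64
	}{
		{"Q8_K", 2 + blockSize + 2*blockSize/16},     // 290
		{"IQ2_XXS", 2 + 2*blockSize/8},               // 66
		{"IQ2_XS", 2 + 2*blockSize/8 + blockSize/32}, // 74
		{"IQ3_XXS", 2 + 3*blockSize/8},               // 98
	}

	for _, s := range sizes {
		fmt.Printf("%-8s %3d bytes/block = %.4f bits/weight\n",
			s.name, s.bytes, float64(s.bytes*8)/blockSize)
	}
}
```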

View File

@@ -1,30 +1,29 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
index 7800c6e7..be30db23 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
@@ -30,6 +30,10 @@
#include <atomic>
#include <signal.h>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
using json = nlohmann::json;
struct server_params
@@ -353,6 +357,9 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}
bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
@@ -32,13 +31,8 @@ index 3102762c..568ac1d0 100644
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
index 933ebbc4..88a4f664 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
@@ -49,30 +43,30 @@ index 96976f24..3543920e 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;
- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -81,25 +75,30 @@ index 96976f24..3543920e 100644
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h

View File

@@ -1,96 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
+#include <signal.h>
using json = nlohmann::json;
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
- llama.queue_tasks.start_loop();
+ shutdown_handler = [&](int) {
+ llama.queue_tasks.terminate();
+ };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+ struct sigaction sigint_action;
+ sigint_action.sa_handler = signal_handler;
+ sigemptyset (&sigint_action.sa_mask);
+ sigint_action.sa_flags = 0;
+ sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+ return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+ };
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+ llama.queue_tasks.start_loop();
+ svr.stop();
t.join();
llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
+ bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
- // Start the main loop. This call is blocking
- [[noreturn]]
+ // end the start_loop routine
+ void terminate() {
+ {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ running = false;
+ }
+ condition_tasks.notify_all();
+ }
+
+ // Start the main loop.
void start_loop() {
+ running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
+ if (!running) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
condition_tasks.wait(lock, [&]{
- return !queue_tasks.empty();
+ return (!queue_tasks.empty() || !running);
});
}
}
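This deleted patch is the cooperative-shutdown change, presumably folded into llama.cpp at the new submodule revision, which is why the local copy can go away. The pattern itself translates directly: a `running` flag guarded by the task mutex, a wait predicate of "work available or shutting down", and a `terminate()` that flips the flag and broadcasts. A sketch of the same idea in Go, using `sync.Cond` for illustration:

```go
package main

import (
	"fmt"
	"sync"
)

// taskQueue mirrors the patched llama_server_queue: a mutex-guarded task
// list plus a running flag that waiters also wake on.
type taskQueue struct {
	mu      sync.Mutex
	cond    *sync.Cond
	tasks   []string
	running bool
}

func newTaskQueue() *taskQueue {
	q := &taskQueue{running: true} // the C++ version sets this in start_loop
	q.cond = sync.NewCond(&q.mu)
	return q
}

func (q *taskQueue) post(t string) {
	q.mu.Lock()
	q.tasks = append(q.tasks, t)
	q.mu.Unlock()
	q.cond.Signal()
}

// terminate mirrors llama_server_queue::terminate: clear the flag under the
// lock, then wake every waiter so the loop can observe it and return.
func (q *taskQueue) terminate() {
	q.mu.Lock()
	q.running = false
	q.mu.Unlock()
	q.cond.Broadcast()
}

// startLoop mirrors the patched start_loop: instead of blocking forever on
// an empty queue, the wait predicate is "work available OR shutting down".
func (q *taskQueue) startLoop() {
	for {
		q.mu.Lock()
		for len(q.tasks) == 0 && q.running {
			q.cond.Wait()
		}
		if !q.running {
			q.mu.Unlock()
			return // the exit that [[noreturn]] previously ruled out
		}
		t := q.tasks[0]
		q.tasks = q.tasks[1:]
		q.mu.Unlock()
		fmt.Println("processing", t)
	}
}

func main() {
	q := newTaskQueue()
	done := make(chan struct{})
	go func() { q.startLoop(); close(done) }()
	q.post("hello")
	q.terminate()
	<-done // without terminate(), this would block forever
}
```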

View File

@@ -60,6 +60,7 @@ function buildOllama() {
             /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
+    New-Item -ItemType Directory -Path .\dist -Force
     cp .\ollama.exe .\dist\ollama-windows-amd64.exe
 }

View File

@@ -52,6 +52,10 @@ type Model struct {
 	Messages []Message
 }
+func (m *Model) IsEmbedding() bool {
+	return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
+}
 type Message struct {
 	Role    string `json:"role"`
 	Content string `json:"content"`
@@ -1103,18 +1107,7 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}
-	proxyURL, err := http.ProxyFromEnvironment(req)
-	if err != nil {
-		return nil, err
-	}
-	client := http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
 	}

View File

@@ -191,6 +191,11 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}
+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support generate"})
+		return
+	}
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {
@@ -1143,6 +1148,11 @@ func ChatHandler(c *gin.Context) {
 		return
 	}
+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support chat"})
+		return
+	}
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {
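From a client's perspective, the new guard turns a confusing failure into a clear 400. A hedged sketch of what that looks like; the model name here is illustrative, and it assumes a local server on the default port 11434 with an embedding-family model pulled:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical embedding-family model name; substitute one whose
	// family is bert or nomic-bert.
	body := bytes.NewBufferString(`{"model": "nomic-embed-text", "prompt": "hi"}`)

	resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode) // expect: 400
	fmt.Println(string(out))     // expect: {"error":"embedding models do not support generate"}
}
```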

View File

@@ -12,7 +12,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -177,16 +176,14 @@ func (b *blobUpload) Run(ctx context.Context, opts *registryOptions) {
 	requestURL := <-b.nextURL
 	// calculate md5 checksum and add it to the commit request
-	var sb strings.Builder
+	md5sum := md5.New()
 	for _, part := range b.Parts {
-		sb.Write(part.Sum(nil))
+		md5sum.Write(part.Sum(nil))
 	}
-	md5sum := md5.Sum([]byte(sb.String()))
 	values := requestURL.Query()
 	values.Add("digest", b.Digest)
-	values.Add("etag", fmt.Sprintf("%x-%d", md5sum, len(b.Parts)))
+	values.Add("etag", fmt.Sprintf("%x-%d", md5sum.Sum(nil), len(b.Parts)))
 	requestURL.RawQuery = values.Encode()
 	headers := make(http.Header)
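The rewrite is behavior-preserving, as the commit message notes: feeding the part sums into a running md5 hasher yields the same digest as buffering them into one string and hashing once, and `%x` renders a `[16]byte` array and a 16-byte slice identically, so the etag does not change. A quick check:

```go
package main

import (
	"crypto/md5"
	"fmt"
)

func main() {
	parts := [][]byte{[]byte("part1"), []byte("part2"), []byte("part3")}

	// Old shape: buffer everything, hash once at the end.
	var buf []byte
	for _, p := range parts {
		buf = append(buf, p...)
	}
	old := md5.Sum(buf) // [16]byte

	// New shape: stream each piece straight into the hasher.
	h := md5.New()
	for _, p := range parts {
		h.Write(p)
	}

	fmt.Printf("%x\n%x\n", old, h.Sum(nil)) // identical digests
}
```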