Compare commits
17 Commits
v0.4.0-rc7 ... v0.4.1

- c2e8cbaa14
- 771fab1dd8
- 3a5239e6bf
- 3d25e7bf8c
- 1618700c5a
- b111aa5a91
- 9e83e550e1
- fc2a0715df
- 3020d2dc58
- a909417602
- 6cd566872b
- 9d71bcc3e2
- a4c70fe157
- 34a75102f7
- 4157d1f7b6
- 4ebfa2cb91
- 046054fa3b
.github/workflows/release.yaml (vendored): 6 changes
@@ -12,7 +12,7 @@ on:
 jobs:
   # Full build of the Mac assets
   build-darwin:
-    runs-on: macos-12
+    runs-on: macos-13
     environment: release
     steps:
       - uses: actions/checkout@v4
@@ -43,8 +43,8 @@ jobs:
           APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
           APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
           APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
+          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
         run: |
           ./scripts/build_darwin.sh
@@ -331,6 +331,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
- [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)

### Terminal
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
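The two hunks above retire the `f16_kv` option: the struct field stays so that older clients which still send it keep decoding, but nothing reads it any more and it is dropped from the defaults. A minimal, hypothetical sketch of that backward-compatibility pattern (`runnerOpts` is a stand-in for illustration, not the real `api` types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in mirroring the shape of the diff above: the deprecated field is kept
// so old payloads still unmarshal, but the server never consults it.
type runnerOpts struct {
	NumGPU int  `json:"num_gpu,omitempty"`
	F16KV  bool `json:"f16_kv,omitempty"` // Deprecated: parsed for compatibility, never read
}

func main() {
	var opts runnerOpts
	// A request from an older client that still sets the removed option.
	_ = json.Unmarshal([]byte(`{"num_gpu": 1, "f16_kv": false}`), &opts)
	fmt.Println(opts.NumGPU) // prints 1; F16KV is decoded but no longer influences anything
}
```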
@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
 
 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
       resp->cudaErr = -1;
       return;
     }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
   }
 
+  LOG(resp->ch.verbose, "calling cuInit\n");
   ret = (*resp->ch.cuInit)(0);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   resp->ch.driver_minor = 0;
 
   // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
     resp->ch.driver_major = version / 1000;
     resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }
 
+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->cudaErr = ret;
     return;
   }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }
 
 const int buflen = 256;
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
@@ -108,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w
 
 #### Containerized Linux Build
 
-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
 
 ### Windows
@@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a
 
 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.
 
-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:
 
 `pip install langchain_community`
@@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.
 
+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make
 
 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
@@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
 
 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@@ -68,6 +68,10 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 		return nil, nil
 	}
 
+	if len(data) <= 0 {
+		return nil, errors.New("received zero length image")
+	}
+
 	hash := c.hashImage(data)
 
 	c.mu.Lock()
@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
-
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":       {"num_gpu", "1"},
 		"main_gpu 1":      {"main_gpu", "1"},
 		"low_vram true":   {"low_vram", "true"},
-		"f16_kv true":     {"f16_kv", "true"},
 		"logits_all true": {"logits_all", "true"},
 		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true":   {"use_mmap", "true"},
@@ -6,17 +6,18 @@ set -e
 
 mkdir -p dist
 
-# These require Xcode v13 or older to target MacOS v11
-# If installed to an alternate location use the following to enable
-# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-export CGO_CFLAGS=-mmacosx-version-min=11.3
-export CGO_CXXFLAGS=-mmacosx-version-min=11.3
-export CGO_LDFLAGS=-mmacosx-version-min=11.3
 
 for TARGETARCH in arm64 amd64; do
     echo "Building Go runner darwin $TARGETARCH"
     rm -rf llama/build
     GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
+    # These require Xcode v13 or older to target MacOS v11
+    # If installed to an alternate location use the following to enable
+    # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+    # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
+    export CGO_CFLAGS=-mmacosx-version-min=11.3
+    export CGO_CXXFLAGS=-mmacosx-version-min=11.3
+    export CGO_LDFLAGS=-mmacosx-version-min=11.3
     CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
@@ -690,7 +690,8 @@ func CopyModel(src, dst model.Name) error {
 }
 
 func deleteUnusedLayers(deleteMap map[string]struct{}) error {
-	manifests, err := Manifests()
+	// Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+	manifests, err := Manifests(true)
 	if err != nil {
 		return err
 	}
@@ -853,8 +854,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	manifest, _, err := GetManifest(mp)
 	if errors.Is(err, os.ErrNotExist) {
 		// noop
-	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
-		return err
+	} else if err != nil {
+		slog.Warn("pulling model with bad existing manifest", "name", name, "error", err)
 	} else {
 		for _, l := range manifest.Layers {
 			deleteMap[l.Digest] = struct{}{}
@@ -106,7 +106,8 @@ func (l *Layer) Remove() error {
 		return nil
 	}
 
-	ms, err := Manifests()
+	// Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+	ms, err := Manifests(true)
 	if err != nil {
 		return err
 	}
@@ -123,7 +123,7 @@ func WriteManifest(name model.Name, config Layer, layers []Layer) error {
 	return json.NewEncoder(f).Encode(m)
 }
 
-func Manifests() (map[model.Name]*Manifest, error) {
+func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
 	manifests, err := GetManifestPath()
 	if err != nil {
 		return nil, err
@@ -145,22 +145,29 @@ func Manifests() (map[model.Name]*Manifest, error) {
 		if !fi.IsDir() {
 			rel, err := filepath.Rel(manifests, match)
 			if err != nil {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", match, err)
+				}
 				slog.Warn("bad filepath", "path", match, "error", err)
 				continue
 			}
 
 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", rel, err)
+				}
 				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}
 
 			m, err := ParseNamedManifest(n)
-			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+			if err != nil {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", n, err)
+				}
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
-			} else if err != nil {
-				return nil, fmt.Errorf("%s: %w", n, err)
 			}
 
 			ms[n] = m
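The hunks above give `Manifests` a `continueOnError` parameter: list/delete callers pass `true` so one corrupt manifest cannot block everything, while the prune path passes `false` to surface corruption instead. A simplified, hypothetical sketch of that fail-fast vs. warn-and-skip pattern (`loadAll` and `loadEntry` are stand-ins, not the real manifest code):

```go
package main

import (
	"fmt"
	"log/slog"
)

// loadAll walks paths and loads each entry. With continueOnError=false the first
// bad entry aborts the whole operation; with true it is logged and skipped.
func loadAll(paths []string, continueOnError bool, loadEntry func(string) (string, error)) (map[string]string, error) {
	out := make(map[string]string, len(paths))
	for _, p := range paths {
		v, err := loadEntry(p)
		if err != nil {
			if !continueOnError {
				return nil, fmt.Errorf("%s: %w", p, err) // strict: caller sees the corruption
			}
			slog.Warn("skipping bad entry", "path", p, "error", err)
			continue // tolerant: keep going so one bad file doesn't block everything
		}
		out[p] = v
	}
	return out, nil
}

func main() {
	load := func(p string) (string, error) {
		if p == "bad" {
			return "", fmt.Errorf("corrupt manifest")
		}
		return "ok", nil
	}
	ms, err := loadAll([]string{"good", "bad"}, true, load)
	fmt.Println(len(ms), err) // 1 <nil>: the bad entry was skipped, not fatal
}
```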
@@ -112,7 +112,7 @@ func TestManifests(t *testing.T) {
 		createManifest(t, d, p)
 	}
 
-	ms, err := Manifests()
+	ms, err := Manifests(true)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -27,6 +27,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	isMllama := checkMllamaModelFamily(m)
 
+	var imageNumTokens int
+	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+	if isMllama {
+		// Our mllama implementation packs all of the embeddings into a single token
+		imageNumTokens = 1
+	} else {
+		// Clip images are represented as 768 tokens, each an embedding
+		imageNumTokens = 768
+	}
+
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
@@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		ctxLen := len(s)
 		if m.ProjectorPaths != nil {
 			for _, m := range msgs[i:] {
-				// images are represented as 768 sized embeddings
-				// TODO: get embedding length from project metadata
-				ctxLen += 768 * len(m.Images)
+				ctxLen += imageNumTokens * len(m.Images)
 			}
 		}
 
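Together these hunks change how `chatPrompt` budgets the context window: each image is charged a fixed token cost that depends on the model family (1 for mllama, 768 for clip-style projectors), and messages are kept from newest to oldest while the estimate still fits. A simplified, hypothetical sketch of that budgeting logic (`msg`, `truncate`, and the token counts are stand-ins, not the real server types):

```go
package main

import "fmt"

type msg struct {
	TextTokens int // pretend this is len(tokenize(content))
	Images     int
}

// truncate walks the history from newest to oldest and keeps every message that
// still fits, charging imageNumTokens per attached image.
func truncate(msgs []msg, numCtx, imageNumTokens int) []msg {
	used := 0
	cut := len(msgs) // index of the oldest message that still fits
	for i := len(msgs) - 1; i >= 0; i-- {
		cost := msgs[i].TextTokens + imageNumTokens*msgs[i].Images
		if used+cost > numCtx {
			break // older messages no longer fit
		}
		used += cost
		cut = i
	}
	return msgs[cut:]
}

func main() {
	history := []msg{{TextTokens: 600}, {TextTokens: 100, Images: 1}, {TextTokens: 50}}
	// With clip-style images a single image costs 768 tokens, so a 900-token window
	// keeps only the newest message; with mllama (cost 1) the whole history fits.
	fmt.Println(len(truncate(history, 900, 768))) // 1
	fmt.Println(len(truncate(history, 900, 1)))   // 3
}
```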
@@ -267,7 +267,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}
 
-	slog.Debug("generate request", "prompt", prompt, "images", images)
+	slog.Debug("generate request", "images", len(images), "prompt", prompt)
 
 	ch := make(chan any)
 	go func() {
@@ -622,7 +622,7 @@ func (s *Server) PushHandler(c *gin.Context) {
 }
 
 func checkNameExists(name model.Name) error {
-	names, err := Manifests()
+	names, err := Manifests(true)
 	if err != nil {
 		return err
 	}
@@ -894,7 +894,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
 }
 
 func (s *Server) ListHandler(c *gin.Context) {
-	ms, err := Manifests()
+	ms, err := Manifests(true)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -1211,18 +1211,22 @@ func Serve(ln net.Listener) error {
 	}
 
 	if !envconfig.NoPrune() {
-		// clean up unused layers and manifests
-		if err := PruneLayers(); err != nil {
-			return err
-		}
+		if _, err := Manifests(false); err != nil {
+			slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
+		} else {
+			// clean up unused layers and manifests
+			if err := PruneLayers(); err != nil {
+				return err
+			}
 
-		manifestsPath, err := GetManifestPath()
-		if err != nil {
-			return err
-		}
+			manifestsPath, err := GetManifestPath()
+			if err != nil {
+				return err
+			}
 
-		if err := PruneDirectory(manifestsPath); err != nil {
-			return err
+			if err := PruneDirectory(manifestsPath); err != nil {
+				return err
+			}
 		}
 	}
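The `Serve` hunk above guards the startup prune: a strict `Manifests(false)` scan runs first, and if it reports corruption the destructive cleanup is skipped entirely rather than deleting layers a broken but repairable manifest might still reference. A hypothetical, stripped-down sketch of that "validate before destructive cleanup" pattern (`maybePrune` and its callbacks are illustrative stand-ins):

```go
package main

import (
	"errors"
	"fmt"
	"log/slog"
)

// maybePrune runs a strict validation pass first and only performs the
// destructive cleanup when the scan succeeds.
func maybePrune(strictScan func() error, prune func() error) error {
	if err := strictScan(); err != nil {
		// Leave everything on disk; the user can re-pull or delete to clear the corruption.
		slog.Warn("corrupt manifests detected, skipping prune", "error", err)
		return nil
	}
	return prune()
}

func main() {
	err := maybePrune(
		func() error { return errors.New("bad manifest") },
		func() error { fmt.Println("pruning"); return nil },
	)
	fmt.Println(err) // <nil>: nothing was pruned, startup continues
}
```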
@@ -130,11 +130,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): multimodal models don't support parallel yet
+			// TODO (jmorganca): mllama doesn't support parallel yet
 			// see https://github.com/ollama/ollama/issues/4165
-			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("multimodal models don't support parallel requests yet")
+				slog.Warn("mllama doesn't support parallel requests yet")
 			}
 
 			for {