Compare commits
1 commit
v0.1.20 ... cuda-searc

| Author | SHA1 | Date |
|---|---|---|
|  | be721ca0df |  |
179 gpu/gpu.go
```diff
@@ -13,10 +13,7 @@ import "C"
 import (
 	"fmt"
 	"log"
-	"os"
-	"path/filepath"
 	"runtime"
-	"strings"
 	"sync"
 	"unsafe"
 )
```
```diff
@@ -32,79 +29,31 @@ var gpuHandles *handles = nil
-// With our current CUDA compile flags, 5.2 and older will not work properly
-const CudaComputeMajorMin = 6
-
-// Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libnvidia-ml.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
-	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
-	"/usr/lib/wsl/lib/libnvidia-ml.so*",
-	"/opt/cuda/lib64/libnvidia-ml.so*",
-	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
-	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
-}
-
-var CudaWindowsGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var RocmLinuxGlobs = []string{
-	"/opt/rocm*/lib*/librocm_smi64.so*",
-}
-
-var RocmWindowsGlobs = []string{
-	"c:\\Windows\\System32\\rocm_smi64.dll",
-}
-
 // Note: gpuMutex must already be held
 func initGPUHandles() {
-
-	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
-	var rocmMgmtName string
-	var rocmMgmtPatterns []string
-	switch runtime.GOOS {
-	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
-		rocmMgmtName = "rocm_smi64.dll"
-		rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
-		copy(rocmMgmtPatterns, RocmWindowsGlobs)
-	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
-		rocmMgmtName = "librocm_smi64.so"
-		rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
-		copy(rocmMgmtPatterns, RocmLinuxGlobs)
-	default:
-		return
-	}
-
 	log.Printf("Detecting GPU type")
 	gpuHandles = &handles{nil, nil}
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			log.Printf("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
-			return
-		}
-	}
+	var resp C.cuda_init_resp_t
+	C.cuda_init(&resp)
+	if resp.err != nil {
+		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err))
 
-	rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
-	if len(rocmLibPaths) > 0 {
-		rocm := LoadROCMMgmt(rocmLibPaths)
-		if rocm != nil {
+		var resp C.rocm_init_resp_t
+		C.rocm_init(&resp)
+		if resp.err != nil {
+			log.Printf("ROCm not detected: %s", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
 			log.Printf("Radeon GPU detected")
-			gpuHandles.rocm = rocm
-			return
+			rocm := resp.rh
+			gpuHandles.rocm = &rocm
 		}
-	}
+	} else {
+		log.Printf("Nvidia GPU detected")
+		cuda := resp.ch
+		gpuHandles.cuda = &cuda
 	}
 }
```
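The rewritten initGPUHandles drops the Go-side library search and simply probes CUDA first, falling back to ROCm only if the CUDA probe reports an error. A minimal pure-Go sketch of that first-success probe order — probeCUDA and probeROCm are hypothetical stand-ins for the cgo calls, not part of the diff:

```go
package main

import (
	"errors"
	"log"
)

// handles mirrors the gpuHandles struct in gpu.go: at most one of the
// two fields is set after detection.
type handles struct {
	cuda *struct{} // stand-in for *C.cuda_handle_t
	rocm *struct{} // stand-in for *C.rocm_handle_t
}

// Hypothetical stand-ins for C.cuda_init(&resp) / C.rocm_init(&resp).
func probeCUDA() (*struct{}, error) { return nil, errors.New("libnvidia-ml.so not found") }
func probeROCm() (*struct{}, error) { return nil, errors.New("librocm_smi64.so not found") }

func initGPUHandles() *handles {
	h := &handles{}
	if cuda, err := probeCUDA(); err == nil {
		log.Printf("Nvidia GPU detected")
		h.cuda = cuda
		return h
	} else {
		log.Printf("CUDA not detected: %s", err)
	}
	if rocm, err := probeROCm(); err == nil {
		log.Printf("Radeon GPU detected")
		h.rocm = rocm
		return h
	} else {
		log.Printf("ROCm not detected: %s", err)
	}
	return h // neither probe succeeded: CPU-only fallback
}

func main() { _ = initGPUHandles() }
```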
```diff
@@ -184,99 +133,13 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*512*1024*1024 {
-			overhead = gpus * 512 * 1024 * 1024
+		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
+		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
+		if overhead < 384*1024*1024 {
+			overhead = 384 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
 }
-
-func FindGPULibs(baseLibName string, patterns []string) []string {
-	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
-	var ldPaths []string
-	gpuLibPaths := []string{}
-	log.Printf("Searching for GPU management library %s", baseLibName)
-
-	switch runtime.GOOS {
-	case "windows":
-		ldPaths = strings.Split(os.Getenv("PATH"), ";")
-	case "linux":
-		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
-	default:
-		return gpuLibPaths
-	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
-	for _, ldPath := range ldPaths {
-		d, err := filepath.Abs(ldPath)
-		if err != nil {
-			continue
-		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
-	}
-	for _, pattern := range patterns {
-		// Ignore glob discovery errors
-		matches, _ := filepath.Glob(pattern)
-		for _, match := range matches {
-			// Resolve any links so we don't try the same lib multiple times
-			// and weed out any dups across globs
-			libPath := match
-			tmp := match
-			var err error
-			for ; err == nil; tmp, err = os.Readlink(libPath) {
-				if !filepath.IsAbs(tmp) {
-					tmp = filepath.Join(filepath.Dir(libPath), tmp)
-				}
-				libPath = tmp
-			}
-			new := true
-			for _, cmp := range gpuLibPaths {
-				if cmp == libPath {
-					new = false
-					break
-				}
-			}
-			if new {
-				gpuLibPaths = append(gpuLibPaths, libPath)
-			}
-		}
-	}
-	log.Printf("Discovered GPU libraries: %v", gpuLibPaths)
-	return gpuLibPaths
-}
-
-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
-	for _, libPath := range cudaLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
-		if resp.err != nil {
-			log.Printf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.ch
-		}
-	}
-	return nil
-}
-
-func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
-	var resp C.rocm_init_resp_t
-	for _, libPath := range rocmLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.rocm_init(lib, &resp)
-		if resp.err != nil {
-			log.Printf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.rh
-		}
-	}
-	return nil
-}
```
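The two sides of the CheckVRAM hunk reserve different amounts of headroom. A small sketch comparing the two heuristics on concrete numbers — the 8 GiB / 2 GPU figures are assumed for illustration, not taken from the diff:

```go
package main

import "fmt"

const MiB = 1024 * 1024

// reserve512 mirrors the 512MiB-per-GPU side: 10% of free VRAM,
// but at least 512 MiB per device.
func reserve512(free, devices uint64) uint64 {
	overhead := free / 10
	if overhead < devices*512*MiB {
		overhead = devices * 512 * MiB
	}
	return overhead
}

// reserve384 mirrors the 384Mi side: 10% scaled by device count,
// but at least a flat 384 MiB.
func reserve384(free, devices uint64) uint64 {
	overhead := free * devices / 10
	if overhead < 384*MiB {
		overhead = 384 * MiB
	}
	return overhead
}

func main() {
	free, devices := uint64(8192*MiB), uint64(2)
	fmt.Println("512MiB-per-GPU heuristic:", reserve512(free, devices)/MiB, "MiB") // 1024 (2 * 512)
	fmt.Println("384MiB-floor heuristic:  ", reserve384(free, devices)/MiB, "MiB") // 1638 (8192 * 2 / 10)
}
```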
```diff
@@ -4,9 +4,33 @@
 
 #include <string.h>
 
+#ifndef _WIN32
+const char *cuda_lib_paths[] = {
+    "libnvidia-ml.so",
+    "/usr/lib/wsl/lib/libnvidia-ml.so", // TODO Maybe glob?
+    "/usr/lib/wsl/lib/libnvidia-ml.so.1",
+    "/usr/local/cuda/lib64/libnvidia-ml.so",
+    "/usr/lib/libnvidia-ml.so",
+    "/usr/lib/libnvidia-ml.so.1",
+    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
+    "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so",
+    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
+    NULL,
+};
+#else
+const char *cuda_lib_paths[] = {
+    "nvml.dll",
+    "",
+    NULL,
+};
+#endif
+
 #define CUDA_LOOKUP_SIZE 6
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+void cuda_init(cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
```
```diff
@@ -25,12 +49,16 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
   };
 
-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
+    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
+  }
   if (!resp->ch.handle) {
+    // TODO improve error message, as the LOAD_ERR will have typically have the
+    // final path that was checked which might be confusing.
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             cuda_lib_paths[0], msg);
     free(msg);
     resp->err = strdup(buf);
     return;
```
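The TODO in this hunk flags a real weakness of the loop: LOAD_ERR reports only the last path tried, while the interesting failure may have happened earlier in the list. A hedged Go sketch of one way to keep per-candidate errors while preserving the first-success loop — tryLoad and firstLoadable are hypothetical names, not part of the diff:

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

// tryLoad is a hypothetical stand-in for LOAD_LIBRARY: here it merely
// checks that the candidate file exists.
func tryLoad(path string) error {
	_, err := os.Stat(path)
	return err
}

// firstLoadable walks the candidate list in order, stops at the first
// success, and otherwise returns every per-path error joined together
// instead of only the last one.
func firstLoadable(paths []string) (string, error) {
	var errs []error
	for _, p := range paths {
		if err := tryLoad(p); err == nil {
			return p, nil
		} else {
			errs = append(errs, fmt.Errorf("%s: %w", p, err))
		}
	}
	return "", errors.Join(errs...)
}

func main() {
	path, err := firstLoadable([]string{
		"/usr/lib/wsl/lib/libnvidia-ml.so",
		"/usr/local/cuda/lib64/libnvidia-ml.so",
	})
	if err != nil {
		fmt.Println("no candidate loaded:", err)
		return
	}
	fmt.Println("loaded", path)
}
```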
```diff
@@ -52,8 +80,6 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
   ret = (*resp->ch.initFn)();
   if (ret != NVML_SUCCESS) {
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
     snprintf(buf, buflen, "nvml vram init failure: %d", ret);
     resp->err = strdup(buf);
   }
```
```diff
@@ -36,7 +36,7 @@ typedef struct cuda_compute_capability {
   int minor;
 } cuda_compute_capability_t;
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
+void cuda_init(cuda_init_resp_t *resp);
 void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
 void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
```
```diff
@@ -4,7 +4,22 @@
 
 #include <string.h>
 
-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
+#ifndef _WIN32
+const char *rocm_lib_paths[] = {
+    "librocm_smi64.so",
+    "/opt/rocm/lib/librocm_smi64.so",
+    NULL,
+};
+#else
+// TODO untested
+const char *rocm_lib_paths[] = {
+    "rocm_smi64.dll",
+    "/opt/rocm/lib/rocm_smi64.dll",
+    NULL,
+};
+#endif
+
+void rocm_init(rocm_init_resp_t *resp) {
   rsmi_status_t ret;
   resp->err = NULL;
   const int buflen = 256;
```
```diff
@@ -21,12 +36,14 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
   };
 
-  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
+  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
+    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
+  }
   if (!resp->rh.handle) {
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Radeon GPUs: %s\n",
-             rocm_lib_path, msg);
+             rocm_lib_paths[0], msg);
     free(msg);
     resp->err = strdup(buf);
     return;
```
```diff
@@ -36,7 +53,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->rh.handle);
       resp->rh.handle = NULL;
       char *msg = LOAD_ERR();
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
```
```diff
@@ -48,8 +64,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
   ret = (*resp->rh.initFn)(0);
   if (ret != RSMI_STATUS_SUCCESS) {
-    UNLOAD_LIBRARY(resp->rh.handle);
-    resp->rh.handle = NULL;
     snprintf(buf, buflen, "rocm vram init failure: %d", ret);
     resp->err = strdup(buf);
   }
```
```diff
@@ -69,7 +83,7 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   int i;
 
   if (h.handle == NULL) {
-    resp->err = strdup("rocm handle not initialized");
+    resp->err = strdup("nvml handle sn't initialized");
     return;
   }
```
```diff
@@ -29,7 +29,7 @@ typedef struct rocm_init_resp {
   rocm_handle_t rh;
 } rocm_init_resp_t;
 
-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
+void rocm_init(rocm_init_resp_t *resp);
 void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
 
 #endif // __GPU_INFO_ROCM_H__
```
```diff
@@ -111,10 +111,6 @@ void llama_server_stop() {
   // TODO - too verbose, remove once things are solid
   LOG_TEE("requesting llama server shutdown\n");
   ext_server_running = false;
-
-  // unblocks the update_slots() loop so it can clean up and exit
-  llama->request_cancel(0);
-
   ext_server_thread.join();
   delete llama;
   llama = NULL;
```
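The removed lines are the interesting part of this hunk: one side of llama_server_stop first cancels outstanding work so the update_slots() loop unblocks, and only then joins the server thread; joining while the worker is still blocked risks hanging shutdown. A minimal Go sketch of the same unblock-then-join ordering, using channels (the names are illustrative, not from the diff):

```go
package main

import (
	"log"
	"sync"
)

func main() {
	work := make(chan int)
	quit := make(chan struct{})

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case job := <-work:
				log.Println("processing", job)
			case <-quit:
				return // clean up and exit, like update_slots() after request_cancel
			}
		}
	}()

	// Shutdown: unblock the worker first, then wait. Calling wg.Wait()
	// without closing quit would deadlock, since the worker blocks in select.
	close(quit)
	wg.Wait()
	log.Println("server stopped")
}
```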
129 llm/llm.go
```diff
@@ -2,6 +2,7 @@ package llm
 
 import (
 	"context"
+	"fmt"
 	"log"
 	"os"
 	"runtime"
```
```diff
@@ -40,76 +41,88 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
-	size := ggml.Size
+	fmt.Println("size", ggml.Size)
+	fmt.Println("filetype", ggml.FileType())
+	fmt.Println("architecture", ggml.ModelFamily())
+	fmt.Println("type", ggml.ModelType())
+	fmt.Println("name", ggml.Name())
+	fmt.Println("embd", ggml.NumEmbed())
+	fmt.Println("head", ggml.NumHead())
+	fmt.Println("head_kv", ggml.NumHeadKv())
+	fmt.Println("gqa", ggml.NumGQA())
+
+	available, _ := gpu.CheckVRAM()
+
+	// For now assume filesize = model size
+	// TODO: use actual model size
+	requiredModel := ggml.Size
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calcluations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
+
+	requiredTotal := requiredModel + requiredKv + requiredAlloc
+
+	log.Println("system memory bytes:", available)
+	log.Println("required model bytes:", requiredModel)
+	log.Println("required kv bytes:", requiredKv)
+	log.Println("required alloc bytes:", requiredAlloc)
+	log.Println("required total bytes:", requiredTotal)
 
 	info := gpu.GetGPUInfo()
 	library := info.Library
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
-
-		if size+kv+graph > vram {
-			log.Println("not enough vram available, falling back to CPU only")
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = 1
-	default:
-		if library == "cpu" || library == "default" {
-			log.Println("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
-		}
-
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			library = "cpu"
-			break
-		}
-
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
-
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.NumLayers()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
-
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			log.Printf("not enough vram available, falling back to CPU only")
-			library = "cpu"
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = int(layers)
-	}
+	if opts.NumGPU == -1 {
+		// default to offloading all layers
+		opts.NumGPU = int(ggml.NumLayers()) + 1
+	}
+
+	// decide how many layers to put on the GPU
+	if opts.NumGPU > 0 {
+		switch runtime.GOOS {
+		case "darwin":
+			if requiredTotal > available {
+				log.Println("not enough vram available, falling back to CPU only")
+				opts.NumGPU = 0
+			}
+		default:
+			if library == "cpu" || library == "default" {
+				opts.NumGPU = 0
+				break
+			}
+
+			// no offloading required
+			if requiredTotal <= available {
+				break
+			}
+
+			// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
+			if requiredAlloc > available {
+				log.Printf("not enough vram available, falling back to CPU only")
+				library = "cpu"
+				opts.NumGPU = 0
+				break
+			}
+
+			available -= requiredAlloc
+
+			// fill remaining vram with layers
+			log.Println("splitting", available, "of available memory bytes into layers")
+			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
+			log.Println("bytes per layer:", bytesPerLayer)
+			layers := available / bytesPerLayer
+			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
+			if layers < int64(opts.NumGPU) {
+				opts.NumGPU = int(layers)
+			}
+		}
+	}
 
 	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	return newLlmServer(library, model, adapters, projectors, opts)
```
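The renamed variables carry the same arithmetic on both sides, so the memory estimate is easy to check by hand. A worked example using typical 7B-class hyperparameters — the concrete values below are assumptions for illustration, not read from any model file:

```go
package main

import "fmt"

func main() {
	// Assumed 7B-class hyperparameters (illustrative only).
	var (
		numCtx    int64 = 2048
		numLayers int64 = 32
		numEmbed  int64 = 4096
		numHead   int64 = 32
		numHeadKv int64 = 32
		numGQA    int64 = 1 // NumHead / NumHeadKv
	)

	// fp16 k,v matrices: 2 bytes each * 2 (key and value)
	requiredKv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// graph overhead estimate: 1/6 * kv_cache_size * num_gqa
	requiredAlloc := numGQA * requiredKv / 6

	fmt.Println("kv cache bytes:", requiredKv)       // 1073741824 (exactly 1 GiB)
	fmt.Println("graph alloc bytes:", requiredAlloc) // 178956970 (about 171 MiB)
}
```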
```diff
@@ -5,10 +5,9 @@ set -eu
 
 export VERSION=${VERSION:-0.0.0}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
 
-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 mkdir -p dist
 
-for TARGETARCH in ${BUILD_ARCH}; do
+for TARGETARCH in amd64 arm64; do
   docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
   docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
   docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
```