merge fixes

parent 1e46db8748
commit 75f65bcdbf

llm/server.go | 240 lines changed
@@ -235,29 +235,35 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
     }

     var gpuLibs []string
     newLibs := func(libDirs []string) (ret []string) {
         for _, d := range libDirs {
             found := false
             for _, t := range gpuLibs {
                 if d == t {
                     found = true
                     break
                 }
             }
             if !found {
                 ret = append(ret, d)
             }
         }
         return
     }
     for _, gpu := range gpus {
         gpuLibs = append(gpuLibs, newLibs(gpu.DependencyPath)...)
         gpuLibs = append(gpuLibs, gpu.RunnerName())
     }

     requested := envconfig.LLMLibrary()
     if availableLibs[requested] != "" {
         slog.Info("using requested gpu library", "requested", requested)
-        gpuLibs = []string{availableLibs[requested]}
+        gpuLibs = []string{requested}
     }

     var compatible []string
     for _, gpuLib := range gpuLibs {
         var matchingLibs []string
         for k := range availableLibs {
             // exact match first
             if k == gpuLib {
                 matchingLibs = append([]string{k}, matchingLibs...)
                 continue
             }

             // then match the family (e.g. 'cuda')
             if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] {
                 matchingLibs = append(matchingLibs, k)
             }
         }

         if len(matchingLibs) > 0 {
             compatible = append(compatible, matchingLibs[0])
         }
     }

     exe, err := os.Executable()
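The dedupe-and-match logic in this hunk can be exercised on its own. Below is a minimal standalone sketch, not part of the commit: the library names are hypothetical, and availableLibs (a map in the real code) is flattened to a slice so iteration order is deterministic.

package main

import (
    "fmt"
    "strings"
)

func main() {
    // runner libraries shipped with the binary, as availableLibs' keys might look
    availableLibs := []string{"cpu", "cuda_v11", "cuda_v12", "rocm"}

    // libraries requested by the discovered GPUs, as gpuLibs might look
    gpuLibs := []string{"cuda_v12", "cuda_v13"}

    var compatible []string
    for _, gpuLib := range gpuLibs {
        var matchingLibs []string
        for _, k := range availableLibs {
            if k == gpuLib {
                // exact match goes to the front of the candidate list
                matchingLibs = append([]string{k}, matchingLibs...)
                continue
            }
            // family match: 'cuda_v11' and 'cuda_v13' share the 'cuda' prefix
            if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] {
                matchingLibs = append(matchingLibs, k)
            }
        }
        if len(matchingLibs) > 0 {
            compatible = append(compatible, matchingLibs[0])
        }
    }

    fmt.Println(compatible) // [cuda_v12 cuda_v11]
}

An exact name always wins because it is prepended; a family match such as cuda_v11 standing in for cuda_v13 is only a fallback appended to the tail.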
@@ -269,36 +275,40 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         exe = eval
     }

-    port := 0
-    if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-        var l *net.TCPListener
-        if l, err = net.ListenTCP("tcp", a); err == nil {
-            port = l.Addr().(*net.TCPAddr).Port
-            l.Close()
+    // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
+    // adding each library's respective path to the LD_LIBRARY_PATH, until finally running
+    // without any LD_LIBRARY_PATH flags
+    for {
+        port := 0
+        if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+            var l *net.TCPListener
+            if l, err = net.ListenTCP("tcp", a); err == nil {
+                port = l.Addr().(*net.TCPAddr).Port
+                l.Close()
+            }
         }
     }
-    if port == 0 {
-        slog.Debug("ResolveTCPAddr failed, using random port")
-        port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-    }
-    params := []string{"runner"}
-    if textProcessor != nil {
-        // New engine
-        // TODO - if we have failure to load scenarios, add logic to retry with the old runner
-        params = append(params, "--ollama-engine")
-    }
-    params = append(params, "--model", modelPath)
-    params = append(params, "--port", strconv.Itoa(port))
+        if port == 0 {
+            slog.Debug("ResolveTCPAddr failed, using random port")
+            port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+        }
+        params := []string{"runner"}
+        if textProcessor != nil {
+            // New engine
+            // TODO - if we have failure to load scenarios, add logic to retry with the old runner
+            params = append(params, "--ollama-engine")
+        }
+        params = append(params, "--model", modelPath)
+        params = append(params, "--port", strconv.Itoa(port))

-    var pathEnv string
-    switch runtime.GOOS {
-    case "windows":
-        pathEnv = "PATH"
-    case "darwin":
-        pathEnv = "DYLD_LIBRARY_PATH"
-    default:
-        pathEnv = "LD_LIBRARY_PATH"
-    }
+        var pathEnv string
+        switch runtime.GOOS {
+        case "windows":
+            pathEnv = "PATH"
+        case "darwin":
+            pathEnv = "DYLD_LIBRARY_PATH"
+        default:
+            pathEnv = "LD_LIBRARY_PATH"
+        }

     // Note: we always put our dependency paths first
     // since these are the exact version we compiled/linked against
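The port-selection idiom that moves inside the retry loop here is self-contained: bind to port 0 so the kernel hands out a free port, read it back, and release the listener; if that fails, fall back to a random port in the IANA ephemeral range. A runnable sketch under those assumptions, not part of the commit:

package main

import (
    "fmt"
    "math/rand"
    "net"
)

func choosePort() int {
    port := 0
    if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
        var l *net.TCPListener
        if l, err = net.ListenTCP("tcp", a); err == nil {
            port = l.Addr().(*net.TCPAddr).Port
            l.Close() // release the port; a small race window remains until the child binds it
        }
    }
    if port == 0 {
        // IANA ephemeral range is 49152-65535
        port = rand.Intn(65535-49152) + 49152
    }
    return port
}

func main() {
    fmt.Println("chosen port:", choosePort())
}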
@@ -344,80 +354,86 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         done: make(chan error, 1),
     }

-    s.cmd.Env = os.Environ()
-    s.cmd.Stdout = os.Stdout
-    s.cmd.Stderr = s.status
-    s.cmd.SysProcAttr = LlamaServerSysProcAttr
+        s.cmd.Env = os.Environ()
+        s.cmd.Stdout = os.Stdout
+        s.cmd.Stderr = s.status
+        s.cmd.SysProcAttr = LlamaServerSysProcAttr

-    s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))
+        s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))

         // Always filter down the set of GPUs in case there are any unsupported devices that might crash
         envWorkarounds := gpus.GetVisibleDevicesEnv()
         pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

-    // Update or add the path variable with our adjusted version
-    pathNeeded := true
-    envWorkaroundDone := make([]bool, len(envWorkarounds))
-    for i := range s.cmd.Env {
-        cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-        if strings.EqualFold(cmp[0], pathEnv) {
-            s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-            pathNeeded = false
-        } else if len(envWorkarounds) != 0 {
-            for j, kv := range envWorkarounds {
-                tmp := strings.SplitN(kv, "=", 2)
-                if strings.EqualFold(cmp[0], tmp[0]) {
-                    s.cmd.Env[i] = kv
-                    envWorkaroundDone[j] = true
+        // Update or add the path variable with our adjusted version
+        pathNeeded := true
+        envWorkaroundDone := make([]bool, len(envWorkarounds))
+        for i := range s.cmd.Env {
+            cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
+            if strings.EqualFold(cmp[0], pathEnv) {
+                s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
+                pathNeeded = false
+            } else if len(envWorkarounds) != 0 {
+                for j, kv := range envWorkarounds {
+                    tmp := strings.SplitN(kv, "=", 2)
+                    if strings.EqualFold(cmp[0], tmp[0]) {
+                        s.cmd.Env[i] = kv
+                        envWorkaroundDone[j] = true
+                    }
                 }
             }
         }
     }
-    if pathNeeded {
-        s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-    }
-    for i, done := range envWorkaroundDone {
-        if !done {
-            s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
-        }
-    }
+        if pathNeeded {
+            s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
+        }
+        for i, done := range envWorkaroundDone {
+            if !done {
+                s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
+            }
+        }

-    slog.Info("starting runner", "cmd", s.cmd)
-    slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
-
-    if err = s.cmd.Start(); err != nil {
-        var msg string
-        if s.status != nil && s.status.LastErrMsg != "" {
-            msg = s.status.LastErrMsg
-        }
-        err := fmt.Errorf("error starting runner: %v %s", err, msg)
-        if llamaModel != nil {
-            llama.FreeModel(llamaModel)
-        }
-        return nil, err
-    }
-
-    // reap subprocess when it exits
-    go func() {
-        err := s.cmd.Wait()
-        // Favor a more detailed message over the process exit status
-        if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-            slog.Error("llama runner terminated", "error", err)
-            if strings.Contains(s.status.LastErrMsg, "unknown model") {
-                s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
-            }
-            s.done <- errors.New(s.status.LastErrMsg)
-        } else {
-            s.done <- err
-        }
-    }()
-
-    if textProcessor != nil {
-        return &ollamaServer{llmServer: s}, nil
-    } else {
-        return &llamaServer{llmServer: s, ggml: f}, nil
-    }
+        slog.Info("starting runner", "cmd", s.cmd)
+        slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
+
+        if err = s.cmd.Start(); err != nil {
+            var msg string
+            if s.status != nil && s.status.LastErrMsg != "" {
+                msg = s.status.LastErrMsg
+            }
+            err := fmt.Errorf("error starting runner: %v %s", err, msg)
+            if len(compatible) == 0 {
+                if llamaModel != nil {
+                    llama.FreeModel(llamaModel)
+                }
+                return nil, err
+            }
+
+            slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
+            compatible = compatible[1:]
+            continue
+        }
+
+        // reap subprocess when it exits
+        go func() {
+            err := s.cmd.Wait()
+            // Favor a more detailed message over the process exit status
+            if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+                slog.Error("llama runner terminated", "error", err)
+                if strings.Contains(s.status.LastErrMsg, "unknown model") {
+                    s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+                }
+                s.done <- errors.New(s.status.LastErrMsg)
+            } else {
+                s.done <- err
+            }
+        }()
+
+        if textProcessor != nil {
+            return &ollamaServer{llmServer: s}, nil
+        } else {
+            return &llamaServer{llmServer: s, ggml: f}, nil
+        }
+    }
 }

 func (s *llmServer) ModelPath() string {
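The environment handling in this hunk updates an existing variable in place, comparing names case-insensitively (which matters on Windows, where Path and PATH are the same variable), and appends it only if it was never seen. A reduced sketch of that update-or-add pattern, with hypothetical values, separate from the commit:

package main

import (
    "fmt"
    "strings"
)

// setEnv rewrites key in env if a case-insensitive match exists,
// otherwise appends it: the same pattern as the pathEnv handling above.
func setEnv(env []string, key, val string) []string {
    for i := range env {
        kv := strings.SplitN(env[i], "=", 2)
        if strings.EqualFold(kv[0], key) {
            env[i] = key + "=" + val
            return env
        }
    }
    return append(env, key+"="+val)
}

func main() {
    env := []string{"Path=C:\\Windows", "HOME=/root"}
    env = setEnv(env, "PATH", "/opt/ollama/lib")      // replaces "Path=..." in place
    env = setEnv(env, "CUDA_VISIBLE_DEVICES", "0")    // absent, so appended
    fmt.Println(env)
}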
@@ -545,9 +561,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
     // For CPU loads we want the memory to be allocated, not FS cache
     if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
         (runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
-        (gpus[0].Library == "vulkan" && s.options.UseMMap == nil) ||
         (gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
-        (gpus[0].Library == "VULKAN" && s.options.UseMMap == nil) ||
+        (gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
         (s.options.UseMMap != nil && !*s.options.UseMMap) {
         s.loadRequest.UseMmap = false
     }
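The condition being cleaned up here reduces to: an explicit UseMMap setting always wins; otherwise mmap is disabled for CPU and Vulkan loads, and on Linux when free memory is below the estimated total, so the model gets allocated outright instead of relying on FS cache. A hedged sketch of that decision table (the memory clause omitted; library names as they appear post-fix):

package main

import "fmt"

// useMmap mirrors the logic above: an explicit user choice wins,
// otherwise CPU and Vulkan backends skip mmap.
func useMmap(library string, userSet *bool) bool {
    if userSet != nil {
        return *userSet // explicit setting always wins
    }
    switch library {
    case "cpu", "Vulkan":
        return false // allocate memory directly rather than lean on FS cache
    }
    return true
}

func main() {
    off := false
    fmt.Println(useMmap("CUDA", nil))   // true
    fmt.Println(useMmap("Vulkan", nil)) // false
    fmt.Println(useMmap("CUDA", &off))  // false
}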
@@ -1313,6 +1328,17 @@ func (s *llmServer) Pid() int {
     return -1
 }

+func (s *llmServer) GetPort() int {
+    return s.port
+}
+
+func (s *llmServer) HasExited() bool {
+    if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
+        return true
+    }
+    return false
+}
+
 var grammarJSON = `
 root ::= object
 value ::= object | array | string | number | ("true" | "false" | "null") ws
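The new HasExited helper leans on os/exec semantics: cmd.ProcessState stays nil until Wait (or Run) completes, and ExitCode() is >= 0 only once the process has actually exited with a code. A small demonstration, assuming a Unix `true` binary on PATH:

package main

import (
    "fmt"
    "os/exec"
)

// hasExited mirrors the new helper: a command counts as exited once
// Wait has populated ProcessState with a real exit code.
func hasExited(cmd *exec.Cmd) bool {
    return cmd != nil && cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0
}

func main() {
    cmd := exec.Command("true")
    fmt.Println(hasExited(cmd)) // false: not started yet, ProcessState is nil
    _ = cmd.Run()               // Run = Start + Wait, which fills in ProcessState
    fmt.Println(hasExited(cmd)) // true
}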