Compare commits


6 Commits

Author SHA1 Message Date
Jeffrey Morgan
16a9006306 add back f16c instructions on intel mac 2023-11-26 15:59:49 -05:00
Jeffrey Morgan
e9216ea459 fix readline history on linux 2023-11-26 15:59:04 -05:00
Jeffrey Morgan
9e4a316405 update submodule commit 2023-11-26 14:52:00 -05:00
Jeffrey Morgan
9fb5e8399c Fix issues with inputting and formatting multi line strings in ollama run
Co-authored-by: Wen Sun <iwendellsun@gmail.com>
2023-11-26 12:54:29 -05:00
Jing Zhang
82b9b329ff windows CUDA support (#1262)
* Support cuda build in Windows
* Enable dynamic NumGPU allocation for Windows
2023-11-24 17:16:36 -05:00
Jongwook Choi
12e8c12d2b Disable CUDA peer access as a workaround for multi-gpu inference bug (#1261)
When CUDA peer access is enabled, multi-gpu inference will produce
garbage output. This is a known bug of llama.cpp (or nvidia). Until the
upstream bug is fixed, we can disable CUDA peer access temporarily
to ensure correct output.

See #961.
2023-11-24 14:05:57 -05:00
14 changed files with 59 additions and 49 deletions

.gitignore

@@ -7,3 +7,4 @@ dist
 ollama
 ggml-metal.metal
 .cache
+*.exe


@@ -262,7 +262,7 @@ func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgre
 func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	var lr ListResponse
-	if err := c.do(ctx, http.MethodGet, "/api/list", nil, &lr); err != nil {
+	if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
 		return nil, err
 	}
 	return &lr, nil

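Note for API consumers: after this change, local models are listed at `/api/tags` rather than `/api/list`. A minimal standalone sketch of calling the endpoint from outside the client package; only the `models` key is visible in this compare (in the Python client below), so the `name` field is an assumption based on the API docs of this period:

```go
// listmodels.go: a sketch of listing local models via the renamed endpoint.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

type listResponse struct {
	Models []struct {
		Name string `json:"name"` // assumed field name, not shown in this diff
	} `json:"models"`
}

func main() {
	// GET /api/tags replaces the short-lived GET /api/list.
	resp, err := http.Get("http://localhost:11434/api/tags")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var lr listResponse
	if err := json.NewDecoder(resp.Body).Decode(&lr); err != nil {
		panic(err)
	}
	for _, m := range lr.Models {
		fmt.Println(m.Name)
	}
}
```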

@@ -157,7 +157,7 @@ def push(model_name, insecure=False, callback=None):
 # List models that are available locally.
 def list():
   try:
-    response = requests.get(f"{BASE_URL}/api/list")
+    response = requests.get(f"{BASE_URL}/api/tags")
     response.raise_for_status()
     data = response.json()
     models = data.get('models', [])


@@ -602,14 +602,12 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 		fmt.Fprintln(os.Stderr, "")
 	}
 
-	prompt := readline.Prompt{
+	scanner, err := readline.New(readline.Prompt{
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
 		AltPlaceholder: `Use """ to end multi-line input`,
-	}
-
-	scanner, err := readline.New(prompt)
+	})
 	if err != nil {
 		return err
 	}
@@ -617,7 +615,7 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 	fmt.Print(readline.StartBracketedPaste)
 	defer fmt.Printf(readline.EndBracketedPaste)
 
-	var multiLineBuffer string
+	var prompt string
 
 	for {
 		line, err := scanner.Readline()
@@ -630,27 +628,33 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 				fmt.Println("\nUse Ctrl-D or /bye to exit.")
 			}
 
 			scanner.Prompt.UseAlt = false
+			prompt = ""
 			continue
 		case err != nil:
 			return err
 		}
 
+		line = strings.TrimSpace(line)
+
 		switch {
-		case scanner.Prompt.UseAlt:
-			if strings.HasSuffix(line, `"""`) {
-				scanner.Prompt.UseAlt = false
-				multiLineBuffer += strings.TrimSuffix(line, `"""`)
-				line = multiLineBuffer
-				multiLineBuffer = ""
-			} else {
-				multiLineBuffer += line + " "
+		case strings.HasPrefix(prompt, `"""`):
+			// if the prompt so far starts with """ then we're in multiline mode
+			// and we need to keep reading until we find a line that ends with """
+			cut, found := strings.CutSuffix(line, `"""`)
+			prompt += cut + "\n"
+			if !found {
 				continue
 			}
-		case strings.HasPrefix(line, `"""`):
+			prompt = strings.TrimPrefix(prompt, `"""`)
+			scanner.Prompt.UseAlt = false
+		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
 			scanner.Prompt.UseAlt = true
-			multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
+			prompt += line + "\n"
 			continue
+		case scanner.Pasting:
+			prompt += line + "\n"
+			continue
 		case strings.HasPrefix(line, "/list"):
 			args := strings.Fields(line)
@@ -757,12 +761,17 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 		case strings.HasPrefix(line, "/"):
 			args := strings.Fields(line)
 			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
+			continue
+		default:
+			prompt += line
 		}
 
-		if len(line) > 0 && line[0] != '/' {
-			if err := generate(cmd, model, line, wordWrap, format); err != nil {
+		if len(prompt) > 0 && prompt[0] != '/' {
+			if err := generate(cmd, model, prompt, wordWrap, format); err != nil {
 				return err
 			}
+
+			prompt = ""
 		}
 	}
 }

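The rewrite above drops the separate `multiLineBuffer`: the accumulated `prompt` string itself signals multiline mode while it still starts with `"""`, and `strings.CutSuffix` (Go 1.20+) detects the closing delimiter. A self-contained sketch of just that accumulation logic, with the readline loop and slash commands stripped out:

```go
// A simplified sketch of the new `"""` handling: the accumulated prompt
// doubles as the mode flag. Input comes from a fixed slice here instead
// of a readline instance.
package main

import (
	"fmt"
	"strings"
)

func main() {
	lines := []string{`"""first`, "second", `third"""`, "single line"}

	var prompt string
	for _, line := range lines {
		line = strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(prompt, `"""`):
			// Already in multiline mode: keep reading until a line ends with """.
			cut, found := strings.CutSuffix(line, `"""`) // needs Go 1.20+
			prompt += cut + "\n"
			if !found {
				continue
			}
			prompt = strings.TrimPrefix(prompt, `"""`)
		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
			// An opening """ switches into multiline mode.
			prompt += line + "\n"
			continue
		default:
			prompt += line
		}

		fmt.Printf("submit: %q\n", prompt) // stands in for generate()
		prompt = ""
	}
}
```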

@@ -372,10 +372,10 @@ curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf08
 Return 201 Created if the blob was successfully created.
 
-## List Models
+## List Local Models
 
 ```shell
-GET /api/list
+GET /api/tags
 ```
 
 List models that are available locally.
@@ -385,7 +385,7 @@ List models that are available locally.
 #### Request
 
 ```shell
-curl http://localhost:11434/api/list
+curl http://localhost:11434/api/tags
 ```
 
 #### Response


@@ -13,6 +13,6 @@ package llm
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner


@@ -21,6 +21,6 @@ package llm
 //go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cuda --target server --config Release
 //go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
 //go:generate cmake --build gguf/build/cuda --target server --config Release
 //go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner


@@ -14,3 +14,11 @@ package llm
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake --build gguf/build/cuda --target server --config Release
+//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe


@@ -84,6 +84,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
+			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
 			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
@@ -269,7 +270,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
-	if runtime.GOOS == "linux" {
+	if runtime.GOOS == "linux" || runtime.GOOS == "windows" {
 		freeBytes, err := CheckVRAM()
 		if err != nil {
 			if !errors.Is(err, errNvidiaSMI) {

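The second hunk lets Windows take the same dynamic layer-offload path as Linux: `NumGPU` asks `CheckVRAM` (backed by nvidia-smi, per the `errNvidiaSMI` check) for free memory and estimates how many layers fit. A simplified sketch of that kind of heuristic; the safety margin and per-layer cost model here are illustrative assumptions, not the exact formula in this file:

```go
// A sketch of a VRAM-based layer-offload estimate. Layers are assumed to
// be roughly equal slices of the model file, and a fraction of free VRAM
// is held back for the KV cache and scratch buffers.
package main

import "fmt"

func estimateGPULayers(numLayer, fileSizeBytes, freeBytes int64) int {
	bytesPerLayer := fileSizeBytes / numLayer
	// 3/4 is an arbitrary headroom factor for this sketch.
	layers := int(freeBytes / bytesPerLayer * 3 / 4)
	if int64(layers) > numLayer {
		return int(numLayer) // everything fits: offload all layers
	}
	return layers
}

func main() {
	// e.g. a ~3.8 GB model with 32 layers on a card reporting 6 GB free
	fmt.Println(estimateGPULayers(32, 3_800_000_000, 6_000_000_000)) // 32
}
```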

@@ -43,9 +43,12 @@ func (h *History) Init() error {
 	}
 
 	path := filepath.Join(home, ".ollama", "history")
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return err
+	}
+
 	h.Filename = path
 
-	//todo check if the file exists
 	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0600)
 	if err != nil {
 		if errors.Is(err, os.ErrNotExist) {

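The history fix addresses a fresh-install failure: `os.O_CREATE` creates the file but not missing parent directories, so opening `~/.ollama/history` fails before anything else has created `~/.ollama`. A small self-contained reproduction of the failure and the fix, using a temp directory instead of the home directory:

```go
// Demonstrates the failure mode the MkdirAll call above fixes.
package main

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	dir, _ := os.MkdirTemp("", "demo")
	defer os.RemoveAll(dir)

	path := filepath.Join(dir, "missing", "history")

	// O_CREATE does not create parent directories: this fails.
	_, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600)
	fmt.Println(errors.Is(err, os.ErrNotExist)) // true

	// The fix: create the parent directory first.
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		panic(err)
	}
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600)
	if err != nil {
		panic(err)
	}
	f.Close()
	fmt.Println("opened", path)
}
```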

@@ -24,6 +24,7 @@ type Instance struct {
 	Prompt   *Prompt
 	Terminal *Terminal
 	History  *History
+	Pasting  bool
 }
 
 func New(prompt Prompt) (*Instance, error) {
@@ -46,7 +47,7 @@ func New(prompt Prompt) (*Instance, error) {
 func (i *Instance) Readline() (string, error) {
 	prompt := i.Prompt.Prompt
-	if i.Prompt.UseAlt {
+	if i.Prompt.UseAlt || i.Pasting {
 		prompt = i.Prompt.AltPrompt
 	}
 	fmt.Print(prompt)
@@ -63,12 +64,13 @@ func (i *Instance) Readline() (string, error) {
 	var esc bool
 	var escex bool
 	var metaDel bool
-	var pasteMode PasteMode
 
 	var currentLineBuf []rune
 
 	for {
-		if buf.IsEmpty() {
+		// don't show placeholder when pasting unless we're in multiline mode
+		showPlaceholder := !i.Pasting || i.Prompt.UseAlt
+		if buf.IsEmpty() && showPlaceholder {
 			ph := i.Prompt.Placeholder
 			if i.Prompt.UseAlt {
 				ph = i.Prompt.AltPlaceholder
@@ -119,9 +121,9 @@ func (i *Instance) Readline() (string, error) {
 				code += string(r)
 			}
 
 			if code == CharBracketedPasteStart {
-				pasteMode = PasteModeStart
+				i.Pasting = true
 			} else if code == CharBracketedPasteEnd {
-				pasteMode = PasteModeEnd
+				i.Pasting = false
 			}
 		case KeyDel:
 			if buf.Size() > 0 {
@@ -196,12 +198,7 @@ func (i *Instance) Readline() (string, error) {
 			}
 
 			buf.MoveToEnd()
 			fmt.Println()
-			switch pasteMode {
-			case PasteModeStart:
-				output = `"""` + output
-			case PasteModeEnd:
-				output = output + `"""`
-			}
+
 			return output, nil
 		default:
			if metaDel {

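Instead of re-wrapping pasted text in `"""` inside the readline layer, the instance now just exposes a `Pasting` flag and lets the caller accumulate lines. The flag is driven by xterm-style bracketed paste: `StartBracketedPaste` (printed in cmd.go above) asks the terminal to wrap pastes in `ESC [ 200~` and `ESC [ 201~`, and the package's `"00~"`/`"01~"` constants are the tails of those sequences after the CSI prefix has been consumed. A self-contained sketch of the handshake, assuming a terminal with bracketed-paste support; without raw mode the markers simply arrive inline with the pasted bytes, which is enough to demonstrate detection:

```go
// Enables bracketed paste, then reads stdin and reports when the
// terminal's paste markers toggle the pasting flag.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

const (
	startBracketedPaste = "\x1b[?2004h" // ask the terminal to bracket pastes
	endBracketedPaste   = "\x1b[?2004l"
	pasteStartMarker    = "\x1b[200~" // prepended to pasted text
	pasteEndMarker      = "\x1b[201~" // appended to pasted text
)

func main() {
	fmt.Print(startBracketedPaste)
	defer fmt.Print(endBracketedPaste)

	scanner := bufio.NewScanner(os.Stdin)
	pasting := false
	for scanner.Scan() {
		line := scanner.Text()
		if strings.Contains(line, pasteStartMarker) {
			pasting = true
			line = strings.ReplaceAll(line, pasteStartMarker, "")
		}
		if strings.Contains(line, pasteEndMarker) {
			pasting = false
			line = strings.ReplaceAll(line, pasteEndMarker, "")
		}
		fmt.Printf("pasting=%v line=%q\n", pasting, line)
	}
}
```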

@@ -76,11 +76,3 @@ const (
 	CharBracketedPasteStart = "00~"
 	CharBracketedPasteEnd   = "01~"
 )
-
-type PasteMode int
-
-const (
-	PastModeOff = iota
-	PasteModeStart
-	PasteModeEnd
-)


@@ -771,7 +771,6 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 		c.String(http.StatusOK, "Ollama is running")
 	})
 
-	r.Handle(method, "/api/list", ListModelsHandler)
 	r.Handle(method, "/api/tags", ListModelsHandler)
 }