Compare commits


1 Commit

Author: Blake Mizerany
SHA1: 1b21a22d0e
Message: types/model: require all names parts start with an alnum char
Date: 2024-04-24 11:54:49 -07:00
17 changed files with 240 additions and 314 deletions

View File

@@ -311,18 +311,29 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: |
llm/build
dist/windows-amd64
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
path: |
llm/build
dist/windows-amd64
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: |
llm/build
dist/windows-amd64
- run: dir llm/build
- run: |
$gopath=(get-command go).source | split-path -parent
@@ -331,6 +342,8 @@ jobs:
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_GENERATE="1"
$env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
$env:HIP_PATH=$(resolve-path ".\dist\deps")
& .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4
with:

View File

@@ -396,10 +396,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
func DefaultOptions() Options {
return Options{
// options set on request to runner
NumPredict: -1,
// set a minimal num_keep to avoid issues on context shifts
NumKeep: 4,
NumPredict: -1,
NumKeep: 0,
Temperature: 0.8,
TopK: 40,
TopP: 0.9,
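The functional change in this hunk is the default num_keep, which controls how many prompt tokens survive a context shift: one side keeps 4, the other 0, alongside a reordered NumPredict line. A minimal sketch of those defaults, using a trimmed, hypothetical stand-in for api.Options:

package main

import "fmt"

// options is a trimmed, hypothetical stand-in for api.Options, covering
// only the runner defaults visible in the hunk above.
type options struct {
    NumPredict  int
    NumKeep     int
    Temperature float32
    TopK        int
    TopP        float32
}

func defaultOptions() options {
    return options{
        NumPredict:  -1, // no explicit limit; clamped elsewhere
        NumKeep:     4,  // tokens preserved across a context shift (0 on the other side of the diff)
        Temperature: 0.8,
        TopK:        40,
        TopP:        0.9,
    }
}

func main() {
    fmt.Printf("%+v\n", defaultOptions())
}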

View File

@@ -92,8 +92,12 @@ Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64
Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
; Assumes v5.7, may need adjustments for v6
#if GetEnv("HIP_PATH") != ""
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
; amdhip64.dll dependency comes from the driver and must be installed already
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
#endif

View File

@@ -17,7 +17,6 @@ import (
"os"
"os/signal"
"path/filepath"
"regexp"
"runtime"
"strings"
"syscall"
@@ -54,6 +53,8 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
p := progress.NewProgress(os.Stderr)
defer p.Stop()
bars := make(map[string]*progress.Bar)
modelfile, err := os.ReadFile(filename)
if err != nil {
return err
@@ -94,16 +95,95 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err
}
// TODO make this work w/ adapters
if fi.IsDir() {
// this is likely a safetensors or pytorch directory
// TODO make this work w/ adapters
tempfile, err := tempZipFiles(path)
tf, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return err
}
defer os.RemoveAll(tempfile)
defer os.RemoveAll(tf.Name())
path = tempfile
zf := zip.NewWriter(tf)
files := []string{}
tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
if err != nil {
return err
} else if len(tfiles) == 0 {
tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
}
}
files = append(files, tfiles...)
if len(files) == 0 {
return fmt.Errorf("no models were found in '%s'", path)
}
// add the safetensor/torch config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "params.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
// just skip whatever files aren't there
if os.IsNotExist(err) {
if strings.HasSuffix(fn, "tokenizer.model") {
// try the parent dir before giving up
parentDir := filepath.Dir(path)
newFn := filepath.Join(parentDir, "tokenizer.model")
f, err = os.Open(newFn)
if os.IsNotExist(err) {
continue
} else if err != nil {
return err
}
} else {
continue
}
} else if err != nil {
return err
}
fi, err := f.Stat()
if err != nil {
return err
}
h, err := zip.FileInfoHeader(fi)
if err != nil {
return err
}
h.Name = filepath.Base(fn)
h.Method = zip.Store
w, err := zf.CreateHeader(h)
if err != nil {
return err
}
_, err = io.Copy(w, f)
if err != nil {
return err
}
}
if err := zf.Close(); err != nil {
return err
}
if err := tf.Close(); err != nil {
return err
}
path = tf.Name()
}
digest, err := createBlob(cmd, client, path)
@@ -111,17 +191,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err
}
name := c.Name
if c.Name == "model" {
name = "from"
}
re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
}
}
bars := make(map[string]*progress.Bar)
fn := func(resp api.ProgressResponse) error {
if resp.Digest != "" {
spinner.Stop()
@@ -155,88 +228,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
func tempZipFiles(path string) (string, error) {
tempfile, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return "", err
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
if err != nil {
return "", err
} else if len(tfiles) == 0 {
tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return "", err
}
}
files := []string{}
files = append(files, tfiles...)
if len(files) == 0 {
return "", fmt.Errorf("no models were found in '%s'", path)
}
// add the safetensor/torch config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "params.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
// just skip whatever files aren't there
if os.IsNotExist(err) {
if strings.HasSuffix(fn, "tokenizer.model") {
// try the parent dir before giving up
parentDir := filepath.Dir(path)
newFn := filepath.Join(parentDir, "tokenizer.model")
f, err = os.Open(newFn)
if os.IsNotExist(err) {
continue
} else if err != nil {
return "", err
}
} else {
continue
}
} else if err != nil {
return "", err
}
fi, err := f.Stat()
if err != nil {
return "", err
}
h, err := zip.FileInfoHeader(fi)
if err != nil {
return "", err
}
h.Name = filepath.Base(fn)
h.Method = zip.Store
w, err := zipfile.CreateHeader(h)
if err != nil {
return "", err
}
_, err = io.Copy(w, f)
if err != nil {
return "", err
}
}
return tempfile.Name(), nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
bin, err := os.Open(path)
if err != nil {
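Whether inlined into CreateHandler or kept as the separate tempZipFiles helper, the logic is the same: glob pytorch_model-*.bin or model-*.safetensors, add the config and tokenizer files, and pack everything into an uncompressed archive with zip.Store so the server can read entries without a decompression pass. A self-contained sketch of that pattern (the file list and temp-file naming are assumptions):

package main

import (
    "archive/zip"
    "io"
    "os"
    "path/filepath"
)

// zipFilesStored packs the named files into out without compression,
// mirroring the diff's use of zip.Store; absent files are skipped, as
// the "just skip whatever files aren't there" comment describes.
func zipFilesStored(out *os.File, files []string) error {
    zw := zip.NewWriter(out)
    for _, fn := range files {
        f, err := os.Open(fn)
        if os.IsNotExist(err) {
            continue // optional files (e.g. added_tokens.json) may be absent
        } else if err != nil {
            return err
        }
        fi, err := f.Stat()
        if err != nil {
            f.Close()
            return err
        }
        h, err := zip.FileInfoHeader(fi)
        if err != nil {
            f.Close()
            return err
        }
        h.Name = filepath.Base(fn)
        h.Method = zip.Store // store, don't deflate: weights are high-entropy
        w, err := zw.CreateHeader(h)
        if err != nil {
            f.Close()
            return err
        }
        if _, err := io.Copy(w, f); err != nil {
            f.Close()
            return err
        }
        f.Close()
    }
    return zw.Close()
}

func main() {
    tf, err := os.CreateTemp("", "ollama-tf")
    if err != nil {
        panic(err)
    }
    defer os.Remove(tf.Name())
    if err := zipFilesStored(tf, []string{"config.json", "tokenizer.model"}); err != nil {
        panic(err)
    }
}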

View File

@@ -21,7 +21,7 @@ init_vars() {
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"

View File

@@ -165,11 +165,11 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
fi
if [ "${ARCH}" == "arm64" ]; then
echo "ARM CPU detected - disabling unsupported AVX instructions"
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
#
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp

View File

@@ -252,11 +252,6 @@ if ($null -ne $script:CUDA_LIB_DIR) {
build
sign
install
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
}
if ($null -ne $env:HIP_PATH) {
@@ -300,14 +295,6 @@ if ($null -ne $env:HIP_PATH) {
}
sign
install
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
}

View File

@@ -5,6 +5,7 @@ import (
"log/slog"
"os"
"strconv"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
@@ -99,22 +100,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
return 0, 0
}
layers := ggml.Tensors().Layers()
var memoryLayerOutput uint64
for k, v := range layers {
if k == "output" || k == "output_norm" {
memoryLayerOutput += v.size()
}
}
if gpus[0].Library == "metal" && opts.UseMMap {
// memory is preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
memoryRequiredPartial += memoryLayerOutput
}
var layerCount int
layers := ggml.Tensors().Layers()
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
@@ -128,11 +115,15 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
}
}
if gpus[0].Library != "metal" || !opts.UseMMap {
// memory was not preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
var memoryLayerOutput uint64
for k, v := range layers {
if !strings.HasPrefix(k, "blk.") {
memoryLayerOutput += v.size()
}
}
memoryRequiredTotal += memoryLayerOutput
if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
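The estimate no longer special-cases the output and output_norm tensors: anything whose name does not begin with blk. is treated as output memory and always added to the running total, dropping the metal/mmap preallocation branch. A rough sketch of that accounting, with the map below standing in for ggml.Tensors().Layers():

package main

import (
    "fmt"
    "strings"
)

// outputLayerBytes applies the new rule from the hunk: any tensor whose
// name is not part of a repeating "blk.N" block (embeddings, norms, the
// output head) counts toward the output allocation. The map argument is
// a hypothetical stand-in for ggml.Tensors().Layers().
func outputLayerBytes(layerSizes map[string]uint64) uint64 {
    var memoryLayerOutput uint64
    for name, size := range layerSizes {
        if !strings.HasPrefix(name, "blk.") {
            memoryLayerOutput += size
        }
    }
    return memoryLayerOutput
}

func main() {
    layers := map[string]uint64{
        "blk.0":       512,
        "blk.1":       512,
        "token_embd":  128,
        "output_norm": 16,
        "output":      256,
    }
    fmt.Println(outputLayerBytes(layers)) // 400: everything except blk.0 and blk.1
}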

View File

@@ -1,45 +0,0 @@
diff --git a/ggml-metal.m b/ggml-metal.m
index 0207b787..b5e9884b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;
-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel

View File

@@ -560,13 +560,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return err
}
defer s.sem.Release(1)
// only allow maximum 10 "context shifts" to avoid infinite generation
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
}
request := map[string]any{
"prompt": req.Prompt,
"stream": true,

View File

@@ -109,6 +109,9 @@ function gatherDependencies() {
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cudart64_*.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cublas64_*.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cublasLt64_*.dll" "${script:DEPS_DIR}\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
@@ -120,6 +123,15 @@ function gatherDependencies() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
if ($null -ne $env:HIP_PATH) {
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:DEPS_DIR}\rocm\"
md "${script:DEPS_DIR}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:DEPS_DIR}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:DEPS_DIR}\rocm\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:DEPS_DIR}\rocm\rocblas\library\"
}
}
function buildInstaller() {

View File

@@ -29,7 +29,6 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -702,32 +701,36 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
return path, nil
}
func CopyModel(src, dst model.Name) error {
manifests, err := GetManifestPath()
func CopyModel(src, dest string) error {
srcModelPath := ParseModelPath(src)
srcPath, err := srcModelPath.GetManifestPath()
if err != nil {
return err
}
dstpath := filepath.Join(manifests, dst.FilepathNoBuild())
if err := os.MkdirAll(filepath.Dir(dstpath), 0o755); err != nil {
return err
}
srcpath := filepath.Join(manifests, src.FilepathNoBuild())
srcfile, err := os.Open(srcpath)
destModelPath := ParseModelPath(dest)
destPath, err := destModelPath.GetManifestPath()
if err != nil {
return err
}
defer srcfile.Close()
dstfile, err := os.Create(dstpath)
if err != nil {
if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
return err
}
defer dstfile.Close()
_, err = io.Copy(dstfile, srcfile)
return err
// copy the file
input, err := os.ReadFile(srcPath)
if err != nil {
fmt.Println("Error reading file:", err)
return err
}
err = os.WriteFile(destPath, input, 0o644)
if err != nil {
fmt.Println("Error reading file:", err)
return err
}
return nil
}
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
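One side of this hunk streams the manifest through io.Copy into os.Create; the other reads the whole file with os.ReadFile and rewrites it. For a small JSON manifest either works, but the streaming form avoids buffering the file in memory and is the more idiomatic Go. A minimal sketch of that variant (paths are hypothetical):

package main

import (
    "io"
    "os"
    "path/filepath"
)

// copyFile streams src to dst after creating dst's parent directory,
// matching the io.Copy side of the hunk; unlike os.ReadFile/os.WriteFile
// it never holds the whole file in memory.
func copyFile(src, dst string) error {
    if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
        return err
    }
    in, err := os.Open(src)
    if err != nil {
        return err
    }
    defer in.Close()

    out, err := os.Create(dst)
    if err != nil {
        return err
    }
    defer out.Close()

    _, err = io.Copy(out, in)
    return err
}

func main() {
    if err := copyFile("manifests/src-model", "manifests/dst-model"); err != nil {
        println(err.Error())
    }
}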

View File

@@ -29,7 +29,6 @@ import (
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -789,34 +788,34 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
}
func (s *Server) CopyModelHandler(c *gin.Context) {
var r api.CopyRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
var req api.CopyRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
src := model.ParseName(r.Source)
if !src.IsValid() {
_ = c.Error(fmt.Errorf("source %q is invalid", r.Source))
}
dst := model.ParseName(r.Destination)
if !dst.IsValid() {
_ = c.Error(fmt.Errorf("destination %q is invalid", r.Destination))
}
if len(c.Errors) > 0 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": c.Errors.Errors()})
if req.Source == "" || req.Destination == "" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source add destination are required"})
return
}
if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
} else if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
if err := ParseModelPath(req.Destination).Validate(); err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if err := CopyModel(req.Source, req.Destination); err != nil {
if os.IsNotExist(err) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
}
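Both versions of the handler distinguish an empty body (io.EOF from ShouldBindJSON) from malformed JSON before validating the names; the switch form reads more cleanly than the if/else chain. A self-contained gin sketch of the same shape, with the copy logic stubbed out and the port an assumption:

package main

import (
    "errors"
    "io"
    "net/http"

    "github.com/gin-gonic/gin"
)

type copyRequest struct {
    Source      string `json:"source"`
    Destination string `json:"destination"`
}

func main() {
    r := gin.Default()
    r.POST("/api/copy", func(c *gin.Context) {
        var req copyRequest
        err := c.ShouldBindJSON(&req)
        switch {
        case errors.Is(err, io.EOF): // empty body and malformed JSON get distinct errors
            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
            return
        case err != nil:
            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
            return
        }
        if req.Source == "" || req.Destination == "" {
            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source and destination are required"})
            return
        }
        c.Status(http.StatusOK) // the actual copy is stubbed out in this sketch
    })
    _ = r.Run(":8080") // port is an assumption
}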

View File

@@ -23,6 +23,7 @@ import (
type LlmRequest struct {
ctx context.Context //nolint:containedctx
model *Model
ggml *llm.GGML // TODO - how large is this, and do we need to free it after we've finished loading?
opts api.Options
sessionDuration time.Duration
successCh chan *runnerRef
@@ -38,7 +39,7 @@ type Scheduler struct {
loaded map[string]*runnerRef
loadedMu sync.Mutex
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
loadFn func(req *LlmRequest, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList
}
@@ -46,7 +47,6 @@ type Scheduler struct {
// TODO set this to zero after a release or two, to enable multiple models by default
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
var maxQueuedRequests = 10 // TODO configurable
var numParallel = 1
func InitScheduler(ctx context.Context) *Scheduler {
maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
@@ -58,14 +58,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
loadedMax = m
}
}
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
p, err := strconv.Atoi(onp)
if err != nil || p <= 0 {
slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
} else {
numParallel = p
}
}
sched := &Scheduler{
pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
@@ -82,16 +74,20 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
ggml, err := llm.LoadModel(model.ModelPath)
req := &LlmRequest{
ctx: c,
model: model,
ggml: ggml,
opts: opts,
sessionDuration: sessionDuration,
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}
// context split across parallel threads
opts.NumCtx = opts.NumCtx * numParallel
if err != nil {
req.errCh <- err
return req.successCh, req.errCh
}
select {
case s.pendingReqCh <- req:
default:
@@ -134,39 +130,28 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if loadedCount == 0 {
slog.Debug("loading first model", "model", pending.model.ModelPath)
gpus := s.getGpuFn()
g := pickBestFitGPUs(pending, gpus)
if g != nil {
gpus = g
}
s.loadFn(pending, gpus)
break
} else if loadedMax > 0 && loadedCount >= loadedMax {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload(pending)
} else {
// Either no models are loaded or below loadedMax
// More than one loaded model, so we have to see if the new one fits
// Get a refreshed GPU list
gpus := s.getGpuFn()
// Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath)
if err != nil {
pending.errCh <- err
break
}
// No models loaded. Load the model but prefer the best fit.
if loadedCount == 0 {
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus)
if g != nil {
gpus = g
}
s.loadFn(pending, ggml, gpus)
break
}
// More than one loaded model, so we have to see if the new one fits
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
gpus = pickBestFitGPUs(pending, ggml, gpus)
gpus = pickBestFitGPUs(pending, gpus)
if gpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, gpus)
s.loadFn(pending, gpus)
break
}
runnerToExpire = s.findRunnerToUnload(pending)
@@ -297,8 +282,8 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
func (s *Scheduler) load(req *LlmRequest, gpus gpu.GpuInfoList) {
llama, err := s.newServerFn(gpus, req.model.ModelPath, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
@@ -432,21 +417,16 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
runner.refMu.Lock()
defer runner.refMu.Unlock()
// Ignore the NumGPU settings for comparison
optsExisting := runner.Options.Runner
optsExisting.NumGPU = -1
optsNew := req.opts.Runner
optsNew.NumGPU = -1
timeout := 10 * time.Second
if runner.loading {
timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
}
// Don't reload runner if num_gpu=-1 was provided
optsExisting := runner.Options.Runner
optsNew := req.opts.Runner
if optsNew.NumGPU < 0 {
optsExisting.NumGPU = -1
optsNew.NumGPU = -1
}
ctx, cancel := context.WithTimeout(ctx, timeout)
ctx, cancel := context.WithTimeout(ctx, timeout) // BUG -
defer cancel()
if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
@@ -454,7 +434,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
runner.llama.Ping(ctx) != nil {
return true
}
return false
}
@@ -475,7 +454,7 @@ func (a ByDuration) Less(i, j int) bool {
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
var estimatedVRAM uint64
for _, gl := range gpus.ByLibrary() {
var ok bool
@@ -487,7 +466,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
// First attempt to fit the model into a single GPU
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
return []gpu.GpuInfo{g}
}
@@ -498,7 +477,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs
if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit(gl, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
return gl
}
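The structural change in this file: llm.LoadModel now runs once in GetRunner and the parsed *llm.GGML travels on the request, so pickBestFitGPUs and load reuse it instead of re-reading the model file on every fitting pass. A schematic sketch of that load-once pattern, with hypothetical stand-in types:

package main

import "fmt"

// Hypothetical stand-ins for *Model and *llm.GGML.
type model struct{ path string }
type ggml struct{ meta string }

// llmRequest carries the parsed model metadata so every later fitting
// pass (pickBestFitGPUs, load) reuses it rather than re-reading the file.
type llmRequest struct {
    model *model
    ggml  *ggml
    errCh chan error
}

// loadModel stands in for llm.LoadModel.
func loadModel(path string) (*ggml, error) {
    return &ggml{meta: "parsed " + path}, nil
}

// newRequest mirrors GetRunner's shape: load once, attach the result to
// the request, and surface a load failure on the request's error channel.
func newRequest(m *model) *llmRequest {
    req := &llmRequest{model: m, errCh: make(chan error, 1)}
    g, err := loadModel(m.path)
    if err != nil {
        req.errCh <- err
        return req
    }
    req.ggml = g
    return req
}

func main() {
    req := newRequest(&model{path: "foo.gguf"})
    fmt.Println(req.ggml.meta)
}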

View File

@@ -47,7 +47,6 @@ func TestLoad(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
s := InitScheduler(ctx)
var ggml *llm.GGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
@@ -60,7 +59,7 @@ func TestLoad(t *testing.T) {
return nil, fmt.Errorf("something failed to load model blah")
}
gpus := gpu.GpuInfoList{}
s.load(req, ggml, gpus)
s.load(req, gpus)
require.Len(t, req.successCh, 0)
require.Len(t, req.errCh, 1)
require.Len(t, s.loaded, 0)
@@ -71,7 +70,7 @@ func TestLoad(t *testing.T) {
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
return server, nil
}
s.load(req, ggml, gpus)
s.load(req, gpus)
select {
case err := <-req.errCh:
require.NoError(t, err)
@@ -83,7 +82,7 @@ func TestLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path"
server.waitResp = fmt.Errorf("wait failure")
s.load(req, ggml, gpus)
s.load(req, gpus)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
@@ -102,7 +101,6 @@ type bundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
ggml *llm.GGML
}
func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
@@ -134,15 +132,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
})
assert.Nil(t, err)
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath)
ggml, err := llm.LoadModel(model.ModelPath)
require.NoError(t, err)
scenario.req = &LlmRequest{
ctx: scenario.ctx,
model: model,
ggml: ggml,
sessionDuration: 5 * time.Millisecond,
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
@@ -160,13 +157,13 @@ func TestRequests(t *testing.T) {
scenario1a.req.sessionDuration = 0
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
scenario1b.req.model = scenario1a.req.model
scenario1b.ggml = scenario1a.ggml
scenario1b.req.ggml = scenario1a.req.ggml
scenario1b.req.sessionDuration = 0
// simple reload of same model
scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
scenario2a.req.model = scenario1a.req.model
scenario2a.ggml = scenario1a.ggml
scenario2a.req.ggml = scenario1a.req.ggml
// Multiple loaded models
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -325,14 +322,13 @@ func TestGetRunner(t *testing.T) {
successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
require.Len(t, s.pendingReqCh, 0)
require.Len(t, successCh1c, 0)
require.Len(t, errCh1c, 0)
time.Sleep(5 * time.Millisecond)
require.Len(t, s.loaded, 0)
require.Len(t, errCh1c, 1)
err = <-errCh1c
require.Contains(t, err.Error(), "bad path")
scenario1b.ctxDone()
time.Sleep(5 * time.Millisecond)
require.Len(t, s.loaded, 0)
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count
@@ -370,9 +366,7 @@ func TestPrematureExpired(t *testing.T) {
require.LessOrEqual(t, len(s.finishedReqCh), 1)
time.Sleep(10 * time.Millisecond)
require.Len(t, s.finishedReqCh, 0)
s.loadedMu.Lock()
require.Len(t, s.loaded, 0)
s.loadedMu.Unlock()
// also shouldn't happen in real life
s.finishedReqCh <- scenario1a.req
@@ -432,6 +426,7 @@ func TestUpdateFreeSpace(t *testing.T) {
s.updateFreeSpace(gpus)
require.Equal(t, uint64(850), gpus[0].FreeMemory)
require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}
func TestFindRunnerToUnload(t *testing.T) {
@@ -490,9 +485,6 @@ func TestNeedsReload(t *testing.T) {
require.False(t, resp)
req.opts.NumGPU = 99
resp = runner.needsReload(ctx, req)
require.True(t, resp)
req.opts.NumGPU = -1
resp = runner.needsReload(ctx, req)
require.False(t, resp)
}

View File

@@ -686,7 +686,10 @@ func IsValidNamePart(kind PartKind, s string) bool {
return false
}
var consecutiveDots int
for _, c := range []byte(s) {
for i, c := range []byte(s) {
if i == 0 && !isAlphaNumeric(c) {
return false
}
if c == '.' {
if consecutiveDots++; consecutiveDots >= 2 {
return false
@@ -701,6 +704,10 @@ func IsValidNamePart(kind PartKind, s string) bool {
return true
}
func isAlphaNumeric(c byte) bool {
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
}
func isValidByteFor(kind PartKind, c byte) bool {
if kind == PartNamespace && c == '.' {
return false
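This hunk is the commit itself: a name part is now rejected unless its first byte is alphanumeric, which rules out inputs like "-h" and ".." before the per-byte checks run. A minimal standalone sketch of the check (validNamePart is a simplified, hypothetical stand-in for IsValidNamePart):

package main

import "fmt"

// isAlphaNumeric matches the helper added in the hunk: ASCII letters and digits.
func isAlphaNumeric(c byte) bool {
    return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
}

// validNamePart is a simplified, hypothetical stand-in for IsValidNamePart:
// the first byte must be alphanumeric, and two consecutive dots are rejected.
func validNamePart(s string) bool {
    if len(s) == 0 || !isAlphaNumeric(s[0]) {
        return false
    }
    var consecutiveDots int
    for _, c := range []byte(s) {
        if c == '.' {
            if consecutiveDots++; consecutiveDots >= 2 {
                return false
            }
        } else {
            consecutiveDots = 0
        }
    }
    return true
}

func main() {
    for _, s := range []string{"mistral", "a.b", "-h", "..", "./../passwd"} {
        fmt.Printf("%-12q -> %v\n", s, validNamePart(s))
    }
}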

View File

@@ -101,6 +101,8 @@ var testNames = map[string]fields{
"./../passwd": {},
"./0+..": {},
"-h": {},
strings.Repeat("a", MaxNamePartLen): {model: strings.Repeat("a", MaxNamePartLen)},
strings.Repeat("a", MaxNamePartLen+1): {},
}
@@ -117,7 +119,7 @@ func TestIsValidNameLen(t *testing.T) {
// preventing path traversal.
func TestNameConsecutiveDots(t *testing.T) {
for i := 1; i < 10; i++ {
s := strings.Repeat(".", i)
s := "a" + strings.Repeat(".", i)
if i > 1 {
if g := ParseNameFill(s, FillNothing).DisplayLong(); g != "" {
t.Errorf("ParseName(%q) = %q; want empty string", s, g)
@@ -339,17 +341,17 @@ func TestDisplayShortest(t *testing.T) {
want string
wantPanic bool
}{
{"example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/mistral:latest+Q4_0", "example.com/_/_:latest", "library/mistral", false},
{"example.com/library/mistral:latest+Q4_0", "example.com/library/?:latest", "mistral", false},
{"example.com/library/mistral:latest+Q4_0", "example.com/?/?:latest", "library/mistral", false},
{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
// case-insensitive
{"Example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/Library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/Mistral:latest+Q4_0", "example.com/library/_:latest", "Mistral", false},
{"example.com/library/mistral:Latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/mistral:Latest+q4_0", "example.com/library/_:latest", "mistral", false},
{"Example.com/library/mistral:latest+Q4_0", "example.com/library/?:latest", "mistral", false},
{"example.com/Library/mistral:latest+Q4_0", "example.com/library/?:latest", "mistral", false},
{"example.com/library/Mistral:latest+Q4_0", "example.com/library/?:latest", "Mistral", false},
{"example.com/library/mistral:Latest+Q4_0", "example.com/library/?:latest", "mistral", false},
{"example.com/library/mistral:Latest+q4_0", "example.com/library/?:latest", "mistral", false},
// zero value
{"", MaskDefault, "", true},
@@ -361,10 +363,10 @@ func TestDisplayShortest(t *testing.T) {
{"registry.ollama.ai/library/mistral:latest+Q4_0", MaskDefault, "mistral", false},
// Auto-Fill
{"x", "example.com/library/_:latest", "x", false},
{"x", "example.com/library/_:latest+Q4_0", "x", false},
{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
{"x", "example.com/library/?:latest", "x", false},
{"x", "example.com/library/?:latest+Q4_0", "x", false},
{"x/y:z", "a.com/library/?:latest+Q4_0", "x/y:z", false},
{"x/y:z", "a.com/library/?:latest+Q4_0", "x/y:z", false},
}
for _, tt := range cases {
@@ -695,10 +697,10 @@ func ExampleName_completeAndResolved() {
func ExampleName_DisplayShortest() {
name := ParseNameFill("example.com/jmorganca/mistral:latest+Q4_0", FillNothing)
fmt.Println(name.DisplayShortest("example.com/jmorganca/_:latest"))
fmt.Println(name.DisplayShortest("example.com/_/_:latest"))
fmt.Println(name.DisplayShortest("example.com/_/_:_"))
fmt.Println(name.DisplayShortest("_/_/_:_"))
fmt.Println(name.DisplayShortest("example.com/jmorganca/?:latest"))
fmt.Println(name.DisplayShortest("example.com/?/?:latest"))
fmt.Println(name.DisplayShortest("example.com/?/?:?"))
fmt.Println(name.DisplayShortest("?/?/?:?"))
// Default
name = ParseNameFill("registry.ollama.ai/library/mistral:latest+Q4_0", FillNothing)