Compare commits
2 Commits
mxyng/fix-
...
mxyng/api-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2fe945412a | ||
|
|
de4fc29773 |
@@ -83,7 +83,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
var memoryLayerOutput uint64
|
||||
|
||||
// The sizes of a layer
|
||||
var baseLayerSize uint64
|
||||
var layerSize uint64
|
||||
|
||||
// The sum of all the layer sizes (just for logging)
|
||||
var memoryWeights uint64
|
||||
@@ -110,27 +110,27 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
layers := ggml.Tensors().Layers()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
baseLayerSize = blk0.size()
|
||||
layerSize = blk0.size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
|
||||
kv := 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
|
||||
layerKV := kv / ggml.KV().BlockCount()
|
||||
baseLayerSize += layerKV
|
||||
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
|
||||
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
// on metal there's no partial offload overhead
|
||||
if gpus[0].Library == "metal" {
|
||||
// there's no partial offload overhead on metal
|
||||
graphPartialOffload = graphFullOffload
|
||||
} else if len(gpus) > 1 {
|
||||
// multigpu should always use the partial graph size
|
||||
@@ -140,7 +140,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
@@ -165,12 +164,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
gzo = gpuZeroOverhead
|
||||
}
|
||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
|
||||
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*baseLayerSize {
|
||||
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
||||
continue
|
||||
}
|
||||
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
||||
gpuAllocations[i] += gpus[i].MinimumMemory + baseLayerSize // We hold off on graph until we know partial vs. full
|
||||
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
|
||||
}
|
||||
|
||||
var gpuZeroID int
|
||||
@@ -181,14 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
|
||||
// For all the layers, find where they can fit on the GPU(s)
|
||||
for i := range int(ggml.KV().BlockCount()) {
|
||||
var layerSize uint64
|
||||
// Some models have inconsistent layer sizes
|
||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
||||
layerSize = blk.size()
|
||||
} else {
|
||||
slog.Error("missing layer", "blk", i)
|
||||
continue
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
}
|
||||
|
||||
memoryWeights += layerSize
|
||||
|
||||
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
||||
@@ -200,8 +196,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[i%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > used+layerSize+layerKV {
|
||||
gpuAllocations[g.i] += layerSize + layerKV
|
||||
if g.g.FreeMemory > used+layerSize {
|
||||
gpuAllocations[g.i] += layerSize
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
@@ -210,12 +206,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if layerCount >= int(ggml.KV().BlockCount()) {
|
||||
fullyLoaded = true
|
||||
} else {
|
||||
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
|
||||
overflow += baseLayerSize
|
||||
overflow += layerSize
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,10 +265,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
}
|
||||
tensorSplit = strings.Join(splits, ",")
|
||||
}
|
||||
|
||||
allocationsList := make([]string, len(gpuAllocations))
|
||||
for i, a := range gpuAllocations {
|
||||
allocationsList[i] = format.HumanBytes2(a)
|
||||
allocationsList := []string{}
|
||||
for _, a := range gpuAllocations {
|
||||
allocationsList = append(allocationsList, format.HumanBytes2(a))
|
||||
}
|
||||
|
||||
estimate := MemoryEstimate{
|
||||
@@ -343,9 +337,9 @@ func (m MemoryEstimate) log() {
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
|
||||
"total", format.HumanBytes2(m.memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(m.memoryWeights),
|
||||
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
|
||||
),
|
||||
|
||||
@@ -62,15 +62,6 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
assert.Equal(t, 0, estimate.Layers)
|
||||
assert.Equal(t, uint64(0), estimate.Graph)
|
||||
|
||||
// 5 layers * 4 bytes per layer
|
||||
if estimate.memoryWeights != 20 {
|
||||
t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
|
||||
}
|
||||
|
||||
if estimate.memoryLayerOutput != 4 {
|
||||
t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
|
||||
}
|
||||
})
|
||||
|
||||
// derived from the dummy ggml file above
|
||||
@@ -133,15 +124,6 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||
}
|
||||
|
||||
// 5 layers * 4 bytes per layer
|
||||
if estimate.memoryWeights != 20 {
|
||||
t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
|
||||
}
|
||||
|
||||
if estimate.memoryLayerOutput != 4 {
|
||||
t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,11 +44,12 @@ type LlamaServer interface {
|
||||
|
||||
// llmServer is an instance of the llama.cpp server
|
||||
type llmServer struct {
|
||||
port int
|
||||
cmd *exec.Cmd
|
||||
done chan error // Channel to signal when the process exits
|
||||
status *StatusWriter
|
||||
options api.Options
|
||||
port int
|
||||
cmd *exec.Cmd
|
||||
done chan error // Channel to signal when the process exits
|
||||
status *StatusWriter
|
||||
options api.Options
|
||||
numParallel int
|
||||
|
||||
estimate MemoryEstimate
|
||||
totalLayers uint64
|
||||
@@ -343,6 +344,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
estimate: estimate,
|
||||
numParallel: numParallel,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
totalLayers: ggml.KV().BlockCount() + 1,
|
||||
gpus: gpus,
|
||||
@@ -890,11 +892,14 @@ type EmbedResponse struct {
|
||||
}
|
||||
|
||||
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
// each input will use a slot, so we need to acquire the semaphore for
|
||||
// the number of inputs up to numParallel
|
||||
slots := int64(min(len(input), s.numParallel))
|
||||
if err := s.sem.Acquire(ctx, slots); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
defer s.sem.Release(slots)
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
|
||||
@@ -19,6 +19,7 @@ type Manifest struct {
|
||||
Config *Layer `json:"config"`
|
||||
Layers []*Layer `json:"layers"`
|
||||
|
||||
name model.Name
|
||||
filepath string
|
||||
fi os.FileInfo
|
||||
digest string
|
||||
@@ -69,7 +70,6 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
|
||||
|
||||
p := filepath.Join(manifests, n.Filepath())
|
||||
|
||||
var m Manifest
|
||||
f, err := os.Open(p)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -81,11 +81,13 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var m Manifest
|
||||
sha256sum := sha256.New()
|
||||
if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
m.name = n
|
||||
m.filepath = p
|
||||
m.fi = fi
|
||||
m.digest = hex.EncodeToString(sha256sum.Sum(nil))
|
||||
|
||||
150
server/routes.go
150
server/routes.go
@@ -703,6 +703,153 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func manifestLayers(m *Manifest, exclude []string) (map[string]any, error) {
|
||||
r := map[string]any{
|
||||
"name": m.name.DisplayShortest(),
|
||||
"digest": m.digest,
|
||||
"size": m.Size(),
|
||||
"modified_at": m.fi.ModTime(),
|
||||
}
|
||||
|
||||
excludeAll := slices.Contains(exclude, "all")
|
||||
excludeDetails := slices.Contains(exclude, "details")
|
||||
|
||||
for _, layer := range m.Layers {
|
||||
var errExcludeKey = errors.New("exclude key")
|
||||
key, content, err := func() (string, any, error) {
|
||||
key := strings.TrimPrefix(layer.MediaType, "application/vnd.ollama.image.")
|
||||
if slices.Contains(exclude, key) || excludeAll {
|
||||
return "", nil, errExcludeKey
|
||||
}
|
||||
|
||||
f, err := layer.Open()
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
switch key {
|
||||
case "model", "projector", "adapter":
|
||||
ggml, _, err := llm.DecodeGGML(f, 0)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
content := map[string]any{
|
||||
"architecture": ggml.KV().Architecture(),
|
||||
"file_type": ggml.KV().FileType().String(),
|
||||
"parameter_count": ggml.KV().ParameterCount(),
|
||||
}
|
||||
|
||||
if !slices.Contains(exclude, key+".details") && !excludeAll && !excludeDetails {
|
||||
// exclude any extraneous or redundant fields
|
||||
delete(ggml.KV(), "general.basename")
|
||||
delete(ggml.KV(), "general.description")
|
||||
delete(ggml.KV(), "general.filename")
|
||||
delete(ggml.KV(), "general.finetune")
|
||||
delete(ggml.KV(), "general.languages")
|
||||
delete(ggml.KV(), "general.license")
|
||||
delete(ggml.KV(), "general.license.link")
|
||||
delete(ggml.KV(), "general.name")
|
||||
delete(ggml.KV(), "general.paramter_count")
|
||||
delete(ggml.KV(), "general.size_label")
|
||||
delete(ggml.KV(), "general.tags")
|
||||
delete(ggml.KV(), "general.type")
|
||||
delete(ggml.KV(), "general.quantization_version")
|
||||
delete(ggml.KV(), "tokenizer.chat_template")
|
||||
content["details"] = ggml.KV()
|
||||
}
|
||||
|
||||
return key, content, nil
|
||||
case "params", "messages":
|
||||
var content any
|
||||
if err := json.NewDecoder(f).Decode(&content); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
return key, content, nil
|
||||
case "template", "system", "license":
|
||||
bts, err := io.ReadAll(f)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
if key == "license" {
|
||||
return key, []any{string(bts)}, nil
|
||||
}
|
||||
|
||||
return key, string(bts), nil
|
||||
}
|
||||
|
||||
return layer.MediaType, nil, nil
|
||||
}()
|
||||
if errors.Is(err, errExcludeKey) {
|
||||
continue
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if s, ok := r[key].([]any); ok {
|
||||
r[key] = append(s, content)
|
||||
} else {
|
||||
r[key] = content
|
||||
}
|
||||
}
|
||||
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func (s *Server) GetModelsHandler(c *gin.Context) {
|
||||
ms, err := Manifests()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
var rs []map[string]any
|
||||
for _, m := range ms {
|
||||
r, err := manifestLayers(m, c.QueryArray("exclude"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
rs = append(rs, r)
|
||||
}
|
||||
|
||||
slices.SortStableFunc(rs, func(i, j map[string]any) int {
|
||||
// most recently modified first
|
||||
return cmp.Compare(
|
||||
j["modified_at"].(time.Time).Unix(),
|
||||
i["modified_at"].(time.Time).Unix(),
|
||||
)
|
||||
})
|
||||
|
||||
c.JSON(http.StatusOK, rs)
|
||||
}
|
||||
|
||||
func (s *Server) GetModelHandler(c *gin.Context) {
|
||||
n := model.ParseName(strings.TrimPrefix(c.Param("model"), "/"))
|
||||
if !n.IsValid() {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
|
||||
return
|
||||
}
|
||||
|
||||
m, err := ParseNamedManifest(n)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
r, err := manifestLayers(m, c.QueryArray("exclude"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, r)
|
||||
}
|
||||
|
||||
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
m, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
@@ -1090,6 +1237,9 @@ func (s *Server) GenerateRoutes() http.Handler {
|
||||
c.String(http.StatusOK, "Ollama is running")
|
||||
})
|
||||
|
||||
r.Handle(method, "/api/models", s.GetModelsHandler)
|
||||
r.Handle(method, "/api/models/*model", s.GetModelHandler)
|
||||
|
||||
r.Handle(method, "/api/tags", s.ListModelsHandler)
|
||||
r.Handle(method, "/api/version", func(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, gin.H{"version": version.Version})
|
||||
|
||||
Reference in New Issue
Block a user