Merge remote-tracking branch 'upstream/main' into vulkanV3
commit e6da524ab7
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
 	return ok
 }
 
-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
 	}
 }
 
-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
 	if t == nil || t.Value == nil {
 		return ""
 	}
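Note: the diff only shows the nil checks of the renamed accessors, so the sketch below is a minimal, hypothetical stand-in for api.ThinkValue rather than the real implementation. One concrete consequence of the rename worth remembering: a method named String() string means *ThinkValue now satisfies fmt.Stringer, so the wrapped value prints itself with %v/%s.

```go
package main

import "fmt"

// Minimal stand-in for api.ThinkValue, assuming only what the hunks show:
// an untyped Value field and nil-safe accessors.
type ThinkValue struct {
	Value any
}

// Bool reports whether thinking is enabled in any way (nil-safe, per the diff).
func (t *ThinkValue) Bool() bool {
	if t == nil || t.Value == nil {
		return false
	}
	return true // placeholder: the real method inspects t.Value further
}

// String returns the value as a string (nil-safe, per the diff).
func (t *ThinkValue) String() string {
	if t == nil || t.Value == nil {
		return ""
	}
	return fmt.Sprintf("%v", t.Value) // placeholder for the real conversion
}

func main() {
	var unset *ThinkValue
	fmt.Println(unset.Bool(), unset.String() == "") // false true: nil receivers are safe

	high := &ThinkValue{Value: "high"}
	// Because the method is named String, *ThinkValue satisfies fmt.Stringer,
	// so %v formats the wrapped value rather than the struct fields.
	fmt.Printf("%v\n", high) // high
}
```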
@@ -179,7 +179,12 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
-		if !gpu.FlashAttention {
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Library == "metal" ||
+			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
+			gpu.Library == "rocm"
+
+		if !supportsFA {
 			return false
 		}
 	}
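The new check derives per-GPU flash-attention support from the backend library and driver major version instead of a precomputed FlashAttention flag. A standalone sketch of that predicate follows; the GpuInfo fields are assumed from the hunk, and the trailing return true falls outside the hunk and is assumed here.

```go
package main

import "fmt"

// Assumed subset of discover.GpuInfo, limited to the fields referenced in the hunk.
type GpuInfo struct {
	Library     string
	DriverMajor int
}

type GpuInfoList []GpuInfo

// FlashAttentionSupported mirrors the added logic: every GPU must use a backend
// known to support flash attention; CUDA additionally needs driver major >= 7.
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "cpu" ||
			gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"

		if !supportsFA {
			return false
		}
	}
	return true
}

func main() {
	mixed := GpuInfoList{{Library: "cuda", DriverMajor: 8}, {Library: "cuda", DriverMajor: 6}}
	fmt.Println(mixed.FlashAttentionSupported()) // false: one GPU fails the CUDA driver check
}
```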
@@ -103,6 +103,7 @@ type ChatCompletionRequest struct {
 	ResponseFormat *ResponseFormat `json:"response_format"`
 	Tools []api.Tool `json:"tools"`
 	Reasoning *Reasoning `json:"reasoning,omitempty"`
+	ReasoningEffort *string `json:"reasoning_effort,omitempty"`
 }
 
 type ChatCompletion struct {
@@ -541,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		options["top_p"] = 1.0
 	}
 
-	if r.Reasoning != nil {
-		options["reasoning"] = *r.Reasoning.Effort
-	}
-
 	var format json.RawMessage
 	if r.ResponseFormat != nil {
 		switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -560,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 
 	var think *api.ThinkValue
 	if r.Reasoning != nil {
+		options["reasoning"] = *r.Reasoning.Effort
 		think = &api.ThinkValue{
 			Value: *r.Reasoning.Effort,
 		}
+	} else if r.ReasoningEffort != nil {
+		options["reasoning"] = *r.ReasoningEffort
+		think = &api.ThinkValue{
+			Value: *r.ReasoningEffort,
+		}
 	}
 
 	return &api.ChatRequest{
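Taken together, the openai-layer hunks accept the reasoning level either as a structured reasoning object or as the flat reasoning_effort string, and both paths now feed the same ThinkValue. A hedged sketch of that mapping in isolation; the names mirror the diff, while the surrounding request and option plumbing is simplified stand-in code.

```go
package main

import "fmt"

// Simplified stand-ins for the request fields shown in the diff.
type Reasoning struct {
	Effort *string
}

type ChatCompletionRequest struct {
	Reasoning       *Reasoning
	ReasoningEffort *string
}

// ThinkValue stand-in: the server-side type that carries the thinking level.
type ThinkValue struct {
	Value any
}

// thinkFromRequest mirrors the branch added in fromChatRequest: the structured
// reasoning field wins, and the flat reasoning_effort string is the fallback.
func thinkFromRequest(r ChatCompletionRequest, options map[string]any) *ThinkValue {
	if r.Reasoning != nil {
		options["reasoning"] = *r.Reasoning.Effort
		return &ThinkValue{Value: *r.Reasoning.Effort}
	} else if r.ReasoningEffort != nil {
		options["reasoning"] = *r.ReasoningEffort
		return &ThinkValue{Value: *r.ReasoningEffort}
	}
	return nil
}

func main() {
	effort := "low"
	opts := map[string]any{}
	think := thinkFromRequest(ChatCompletionRequest{ReasoningEffort: &effort}, opts)
	fmt.Println(opts["reasoning"], think.Value) // low low
}
```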
@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	thinkVal := false
 	thinkLevel := ""
 	if think != nil {
-		thinkVal = think.AsBool()
-		thinkLevel = think.AsString()
+		thinkVal = think.Bool()
+		thinkLevel = think.String()
 	}
 	var b bytes.Buffer
 	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	thinkVal := false
 	thinkLevel := ""
 	if think != nil {
-		thinkVal = think.AsBool()
-		thinkLevel = think.AsString()
+		thinkVal = think.Bool()
+		thinkLevel = think.String()
 	}
 	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
 		return "", nil, err
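Both chatPrompt call sites pass the same three thinking-related template inputs: Think (is thinking enabled), ThinkLevel (the optional level string), and IsThinkSet (whether the field was provided at all). A small hypothetical text/template sketch of how a prompt template might consume them; the field names follow the hunk, the template text itself is made up.

```go
package main

import (
	"os"
	"text/template"
)

// Values mirrors the three thinking fields passed to the template in the diff.
// The real template.Values also carries Messages and Tools, omitted here.
type Values struct {
	Think      bool
	ThinkLevel string
	IsThinkSet bool
}

func main() {
	// Hypothetical template: open a reasoning preamble when thinking is on,
	// include the level only when one is set, and distinguish "explicitly
	// disabled" (IsThinkSet && !Think) from "never requested".
	tmpl := template.Must(template.New("prompt").Parse(
		"{{if .Think}}<think{{if .ThinkLevel}} level={{.ThinkLevel}}{{end}}>{{else if .IsThinkSet}}/no_think{{end}}\n"))

	_ = tmpl.Execute(os.Stdout, Values{Think: true, ThinkLevel: "high", IsThinkSet: true})
	// Output: <think level=high>
}
```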
@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 
 	// Validate Think value: string values currently only allowed for gptoss models
 	if req.Think != nil && req.Think.IsString() && !useHarmony {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
 		return
 	}
 
@@ -213,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
-	if req.Think != nil && req.Think.AsBool() {
+	if req.Think != nil && req.Think.Bool() {
 		caps = append(caps, model.CapabilityThinking)
 		// TODO(drifkin): consider adding a warning if it's false and the model
 		// doesn't support thinking. It's not strictly required, but it can be a
@@ -288,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 	}
 
-	values.Think = req.Think != nil && req.Think.AsBool()
+	values.Think = req.Think != nil && req.Think.Bool()
 	values.ThinkLevel = ""
 	if req.Think != nil {
-		values.ThinkLevel = req.Think.AsString()
+		values.ThinkLevel = req.Think.String()
 	}
 	values.IsThinkSet = req.Think != nil
 
@@ -317,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	var thinkingState *thinking.Parser
 	if !useHarmony {
 		openingTag, closingTag := thinking.InferTags(m.Template.Template)
-		if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+		if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
 			thinkingState = &thinking.Parser{
 				OpeningTag: openingTag,
 				ClosingTag: closingTag,
@@ -371,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	*toolName = strings.TrimPrefix(*toolName, "functions.")
 	var args api.ToolCallFunctionArguments
 	if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-		ch <- gin.H{"error parsing tool call": err.Error()}
+		errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+		ch <- gin.H{"error": errStr}
 		return
 	}
 
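The new error path returns the failure under the conventional "error" key and includes the raw tool-call payload, so malformed arguments can actually be debugged. A minimal sketch of the old versus new response shapes; gin.H is just map[string]any, and the payload below is illustrative.

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Illustrative malformed tool-call arguments, e.g. truncated streaming output.
	toolContent := `{"location": "Paris", "unit":`

	var args map[string]any
	if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
		// Old shape: the descriptive text was the map key, so clients looking
		// for an "error" field found nothing useful.
		oldPayload := map[string]any{"error parsing tool call": err.Error()}

		// New shape: a conventional "error" key, with the raw content included
		// so the failing tool call can be inspected.
		errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
		newPayload := map[string]any{"error": errStr}

		fmt.Println(oldPayload)
		fmt.Println(newPayload)
	}
}
```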
@@ -1546,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
-	if req.Think != nil && req.Think.AsBool() {
+	if req.Think != nil && req.Think.Bool() {
 		caps = append(caps, model.CapabilityThinking)
 	}
 
@@ -1600,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 
 	// Validate Think value: string values currently only allowed for gptoss models
 	if req.Think != nil && req.Think.IsString() && !useHarmony {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
 		return
 	}
 
@@ -1619,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 
 	var thinkingState *thinking.Parser
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+	if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
 		thinkingState = &thinking.Parser{
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
@@ -1671,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	*toolName = strings.TrimPrefix(*toolName, "functions.")
 	var args api.ToolCallFunctionArguments
 	if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-		ch <- gin.H{"error parsing tool call": err.Error()}
+		errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+		ch <- gin.H{"error": errStr}
 		return
 	}
 	res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-	var estimatedVRAM uint64
-
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
 	}
 
 	for _, gl := range gpus.ByLibrary() {
-		var ok bool
 		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+		// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
 		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
-		// First attempt to fit the model into a single GPU
-		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCtx * p
-			if !envconfig.SchedSpread() {
-				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+		if !envconfig.SchedSpread() {
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				// Try to pack into as few GPUs as possible, starting from 1 GPU
+				for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+					gpuSubset := sgl[:numGPUs]
+					ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+					if ok {
+						slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+							"model", req.model.ModelPath,
+							"library", sgl[0].Library,
+							"parallel", p,
+							"required", format.HumanBytes2(estimatedVRAM),
+							"gpus", numGPUs)
 						*numParallel = p
-						return []discover.GpuInfo{g}
+						return gpuSubset
 					}
 				}
 			}
-		}
-
-		// TODO future refinements
-		// - if multiple Libraries, see if any single GPU in any Library will fit
-		// - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-		// Now try all the GPUs
-		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-				*numParallel = p
-				return sgl
+		} else {
+			// TODO future refinements
+			// - if multiple Libraries, see if any single GPU in any Library will fit
+			// - try subsets of GPUs instead of just falling back to 1 or all in a family
+
+			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+					slog.Info("new model will fit in available VRAM, loading",
+						"model", req.model.ModelPath,
+						"library", sgl[0].Library,
+						"parallel", p,
+						"required", format.HumanBytes2(estimatedVRAM),
+						"gpus", len(sgl))
+					*numParallel = p
+					return sgl
+				}
 			}
 		}
 	}
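The scheduling change replaces the old "single GPU first, otherwise all GPUs" strategy with a packing loop that grows a VRAM-sorted prefix of the GPU list one device at a time, so a model lands on the minimum number of GPUs that predicts a full fit (unless OLLAMA_SCHED_SPREAD forces the spread path). A self-contained sketch of that packing idea, with a stubbed fit predictor standing in for llm.PredictServerFit and a simplified stand-in for discover.GpuInfo.

```go
package main

import (
	"fmt"
	"sort"
)

// gpu is a stand-in for discover.GpuInfo; only free VRAM matters for this sketch.
type gpu struct {
	ID   string
	Free uint64 // bytes of free VRAM
}

// fits is a stand-in for llm.PredictServerFit: here a model "fits" if the
// subset's combined free VRAM covers the requirement.
func fits(subset []gpu, required uint64) bool {
	var total uint64
	for _, g := range subset {
		total += g.Free
	}
	return total >= required
}

// pickMinimumGPUs mirrors the new packing loop: sort by free VRAM descending,
// then try prefixes of growing length and return the first one that fits.
func pickMinimumGPUs(gpus []gpu, required uint64) []gpu {
	sort.Slice(gpus, func(i, j int) bool { return gpus[i].Free > gpus[j].Free })
	for n := 1; n <= len(gpus); n++ {
		if subset := gpus[:n]; fits(subset, required) {
			return subset
		}
	}
	return nil // no full fit; the real scheduler falls back to other strategies
}

func main() {
	gpus := []gpu{{"gpu0", 8 << 30}, {"gpu1", 24 << 30}, {"gpu2", 16 << 30}}
	chosen := pickMinimumGPUs(gpus, 30<<30) // needs ~30 GiB
	for _, g := range chosen {
		fmt.Println(g.ID) // gpu1, gpu2: smallest prefix whose VRAM covers the requirement
	}
}
```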