Merge remote-tracking branch 'upstream/main' into vulkanV3

Inforithmics 2025-08-12 21:51:39 +02:00
commit e6da524ab7
6 changed files with 67 additions and 47 deletions

View File

@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
return ok
}
// AsBool returns the value as a bool (true if enabled in any way)
func (t *ThinkValue) AsBool() bool {
// Bool returns the value as a bool (true if enabled in any way)
func (t *ThinkValue) Bool() bool {
if t == nil || t.Value == nil {
return false
}
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
}
}
// AsString returns the value as a string
func (t *ThinkValue) AsString() string {
// String returns the value as a string
func (t *ThinkValue) String() string {
if t == nil || t.Value == nil {
return ""
}
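
Note on the rename above: switching from AsBool/AsString to Bool/String also makes *ThinkValue satisfy fmt.Stringer, so the value prints via String() wherever it is formatted with %v or %s. A minimal, self-contained sketch of the caller-side change (the reduced ThinkValue and the method bodies below are illustrative placeholders, not the upstream implementation):

package main

import "fmt"

// Reduced stand-in for api.ThinkValue, for illustration only.
type ThinkValue struct {
	Value any
}

// Bool reports whether thinking is enabled in any way (placeholder logic).
func (t *ThinkValue) Bool() bool {
	if t == nil || t.Value == nil {
		return false
	}
	switch v := t.Value.(type) {
	case bool:
		return v
	case string:
		return v != ""
	}
	return false
}

// String returns the value as a string (placeholder logic).
func (t *ThinkValue) String() string {
	if t == nil || t.Value == nil {
		return ""
	}
	if s, ok := t.Value.(string); ok {
		return s
	}
	return ""
}

func main() {
	think := &ThinkValue{Value: "high"}
	// Callers move from think.AsBool()/think.AsString() to:
	fmt.Println(think.Bool(), think.String()) // true high
}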

View File

@@ -179,7 +179,12 @@ func (si SystemInfo) GetOptimalThreadCount() int {
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
if !gpu.FlashAttention {
supportsFA := gpu.Library == "cpu" ||
gpu.Library == "metal" ||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
gpu.Library == "rocm"
if !supportsFA {
return false
}
}
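
The new check above replaces the per-GPU FlashAttention flag with a per-library rule: cpu, metal, and rocm always count as supporting flash attention, and cuda counts only when the driver major version is at least 7. A standalone sketch of that rule (the reduced gpuInfo struct stands in for discover.GpuInfo; only the Library and DriverMajor fields are taken from the hunk):

package main

import "fmt"

// Reduced stand-in for discover.GpuInfo, only the fields used by the check.
type gpuInfo struct {
	Library     string
	DriverMajor int
}

// flashAttentionSupported mirrors the loop above: every GPU in the list must
// satisfy the per-library rule, otherwise the whole list is unsupported.
func flashAttentionSupported(gpus []gpuInfo) bool {
	for _, gpu := range gpus {
		supportsFA := gpu.Library == "cpu" ||
			gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"
		if !supportsFA {
			return false
		}
	}
	return true
}

func main() {
	mixed := []gpuInfo{
		{Library: "cuda", DriverMajor: 12},
		{Library: "cuda", DriverMajor: 6}, // driver major below 7 disables flash attention for the whole list
	}
	fmt.Println(flashAttentionSupported(mixed)) // false
}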

View File

@@ -103,6 +103,7 @@ type ChatCompletionRequest struct {
ResponseFormat *ResponseFormat `json:"response_format"`
Tools []api.Tool `json:"tools"`
Reasoning *Reasoning `json:"reasoning,omitempty"`
ReasoningEffort *string `json:"reasoning_effort,omitempty"`
}
type ChatCompletion struct {
@@ -541,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
options["top_p"] = 1.0
}
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
}
var format json.RawMessage
if r.ResponseFormat != nil {
switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -560,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}
}
return &api.ChatRequest{
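
For context on the new ReasoningEffort field above: the handler now accepts either the nested reasoning object or the flat reasoning_effort string, with the nested form taking precedence. A small sketch of that mapping, using reduced request types (the nested field name effort is an assumption inferred from r.Reasoning.Effort; the real openai package types have more fields):

package main

import (
	"encoding/json"
	"fmt"
)

// Reduced stand-ins for the openai package types in this hunk.
type reasoning struct {
	Effort *string `json:"effort,omitempty"`
}

type chatCompletionRequest struct {
	Reasoning       *reasoning `json:"reasoning,omitempty"`
	ReasoningEffort *string    `json:"reasoning_effort,omitempty"`
}

// effortFromRequest mirrors the precedence above: the nested reasoning.effort
// wins over the flat reasoning_effort field.
func effortFromRequest(r chatCompletionRequest) (string, bool) {
	if r.Reasoning != nil && r.Reasoning.Effort != nil {
		return *r.Reasoning.Effort, true
	}
	if r.ReasoningEffort != nil {
		return *r.ReasoningEffort, true
	}
	return "", false
}

func main() {
	for _, body := range []string{
		`{"reasoning": {"effort": "high"}}`,
		`{"reasoning_effort": "low"}`,
	} {
		var req chatCompletionRequest
		if err := json.Unmarshal([]byte(body), &req); err != nil {
			panic(err)
		}
		effort, ok := effortFromRequest(req)
		fmt.Println(effort, ok) // "high true", then "low true"
	}
}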

View File

@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
thinkVal := false
thinkLevel := ""
if think != nil {
thinkVal = think.AsBool()
thinkLevel = think.AsString()
thinkVal = think.Bool()
thinkLevel = think.String()
}
var b bytes.Buffer
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
thinkVal := false
thinkLevel := ""
if think != nil {
thinkVal = think.AsBool()
thinkLevel = think.AsString()
thinkVal = think.Bool()
thinkLevel = think.String()
}
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
return "", nil, err

View File

@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
@@ -213,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
if req.Suffix != "" {
caps = append(caps, model.CapabilityInsert)
}
if req.Think != nil && req.Think.AsBool() {
if req.Think != nil && req.Think.Bool() {
caps = append(caps, model.CapabilityThinking)
// TODO(drifkin): consider adding a warning if it's false and the model
// doesn't support thinking. It's not strictly required, but it can be a
@@ -288,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
}
values.Think = req.Think != nil && req.Think.AsBool()
values.Think = req.Think != nil && req.Think.Bool()
values.ThinkLevel = ""
if req.Think != nil {
values.ThinkLevel = req.Think.AsString()
values.ThinkLevel = req.Think.String()
}
values.IsThinkSet = req.Think != nil
@@ -317,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{
OpeningTag: openingTag,
ClosingTag: closingTag,
@@ -371,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
*toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
ch <- gin.H{"error parsing tool call": err.Error()}
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
ch <- gin.H{"error": errStr}
return
}
@@ -1546,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
if len(req.Tools) > 0 {
caps = append(caps, model.CapabilityTools)
}
if req.Think != nil && req.Think.AsBool() {
if req.Think != nil && req.Think.Bool() {
caps = append(caps, model.CapabilityThinking)
}
@@ -1600,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
@@ -1619,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
var thinkingState *thinking.Parser
openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{
OpeningTag: openingTag,
ClosingTag: closingTag,
@@ -1671,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
*toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
ch <- gin.H{"error parsing tool call": err.Error()}
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
ch <- gin.H{"error": errStr}
return
}
res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}
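
The error-path change above includes the raw tool-call payload in the response body instead of using the message itself as the JSON key. A small sketch of what the new payload looks like when arguments fail to parse (the toolContent value here is made up):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Made-up malformed tool-call arguments (missing closing brace).
	toolContent := `{"location": "Paris"`

	var args map[string]any
	if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
		// Mirrors the new format above: the raw content and the JSON error are
		// folded into one message under a plain "error" key.
		errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
		fmt.Println(map[string]string{"error": errStr})
	}
}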

View File

@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var estimatedVRAM uint64
var numParallelToTry []int
if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
}
for _, gl := range gpus.ByLibrary() {
var ok bool
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
// Note: at present, this favors GPUs with the most free VRAM (in descending order) and ignores relative GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
// First attempt to fit the model into a single GPU
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
if !envconfig.SchedSpread() {
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
*numParallel = p
return []discover.GpuInfo{g}
return gpuSubset
}
}
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return sgl
// Now try all the GPUs (OLLAMA_SCHED_SPREAD is set)
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
*numParallel = p
return sgl
}
}
}
}
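
The scheduler change above replaces the single-GPU-or-all-GPUs choice with a packing loop: GPUs are sorted by free VRAM descending and the candidate set grows one GPU at a time until the fit prediction succeeds. A standalone sketch of that idea (the reduced gpu struct, the required parameter, and the toy predictFit callback are illustrative stand-ins for discover.GpuInfo and llm.PredictServerFit):

package main

import (
	"fmt"
	"sort"
)

// Reduced stand-in for discover.GpuInfo.
type gpu struct {
	ID         string
	FreeMemory uint64
}

// pickMinimalSubset mirrors the loop added above: prefer the fewest GPUs the
// model is predicted to fit on, trying the largest-VRAM GPUs first.
func pickMinimalSubset(gpus []gpu, required uint64, predictFit func([]gpu, uint64) bool) []gpu {
	sorted := append([]gpu(nil), gpus...)
	sort.Slice(sorted, func(i, j int) bool { return sorted[i].FreeMemory > sorted[j].FreeMemory })

	for n := 1; n <= len(sorted); n++ {
		subset := sorted[:n]
		if predictFit(subset, required) {
			return subset
		}
	}
	return nil
}

func main() {
	gpus := []gpu{{"gpu0", 8 << 30}, {"gpu1", 24 << 30}, {"gpu2", 16 << 30}}

	// Toy fit predicate: total free VRAM of the subset must cover the requirement.
	fits := func(subset []gpu, required uint64) bool {
		var total uint64
		for _, g := range subset {
			total += g.FreeMemory
		}
		return total >= required
	}

	chosen := pickMinimalSubset(gpus, 30<<30, fits)
	fmt.Println(len(chosen)) // 2: gpu1 and gpu2 are enough, so gpu0 stays free
}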