feat(llama.cpp): estimate vram usage (#5299)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto 2025-05-02 17:40:26 +02:00 committed by GitHub
parent bace6516f1
commit 5c6cd50ed6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 131 additions and 21 deletions

View file

@@ -7,11 +7,11 @@ import (
     "github.com/rs/zerolog/log"
 
+    gguf "github.com/gpustack/gguf-parser-go"
     cliContext "github.com/mudler/LocalAI/core/cli/context"
     "github.com/mudler/LocalAI/core/config"
     "github.com/mudler/LocalAI/core/gallery"
     "github.com/mudler/LocalAI/pkg/downloader"
-    gguf "github.com/thxcode/gguf-parser-go"
 )
 
 type UtilCMD struct {
@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
     log.Info().
         Any("eosTokenID", f.Tokenizer().EOSTokenID).
         Any("bosTokenID", f.Tokenizer().BOSTokenID).
-        Any("modelName", f.Model().Name).
+        Any("modelName", f.Metadata().Name).
         Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
 
     log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
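For context: the import swap in this file follows gguf-parser-go's move from github.com/thxcode to github.com/gpustack, where the old Model() accessor is now Metadata(). A minimal sketch of the consuming side, assuming ParseGGUFFile as the library's entry point (the parse call itself sits outside this hunk):

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Parse a local GGUF file; "model.gguf" is a placeholder path.
	f, err := gguf.ParseGGUFFile("model.gguf")
	if err != nil {
		panic(err)
	}

	// These are the accessors the commit touches: Model() -> Metadata().
	fmt.Println("name:", f.Metadata().Name)
	fmt.Println("arch:", f.Architecture().Architecture)
	fmt.Println("eos/bos:", f.Tokenizer().EOSTokenID, f.Tokenizer().BOSTokenID)
}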

View file

@@ -3,9 +3,10 @@ package config
 import (
     "strings"
 
+    "github.com/mudler/LocalAI/pkg/xsysinfo"
     "github.com/rs/zerolog/log"
 
-    gguf "github.com/thxcode/gguf-parser-go"
+    gguf "github.com/gpustack/gguf-parser-go"
 )
 
 type familyType uint8
@@ -23,6 +24,7 @@ const (
 const (
     defaultContextSize = 1024
+    defaultNGPULayers  = 99999999
 )
 
 type settingsConfig struct {
@@ -147,7 +149,7 @@ var knownTemplates = map[string]familyType{
 func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
     if defaultCtx == 0 && cfg.ContextSize == nil {
-        ctxSize := f.EstimateLLaMACppUsage().ContextSize
+        ctxSize := f.EstimateLLaMACppRun().ContextSize
         if ctxSize > 0 {
             cSize := int(ctxSize)
             cfg.ContextSize = &cSize
@@ -157,6 +159,46 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
         }
     }
 
+    // GPU options
+    if cfg.Options == nil {
+        if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
+            cfg.Options = []string{"gpu"}
+        }
+    }
+
+    // vram estimation
+    vram, err := xsysinfo.TotalAvailableVRAM()
+    if err != nil {
+        log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
+    } else {
+        estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
+        if err != nil {
+            log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
+        } else {
+            if estimate.IsFullOffload {
+                log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
+            }
+
+            if estimate.EstimatedVRAM > vram {
+                log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
+            }
+
+            if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
+                log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
+                cfg.NGPULayers = &estimate.EstimatedLayers
+            }
+        }
+    }
+
+    if cfg.NGPULayers == nil {
+        // we assume we want to offload all layers
+        defaultHigh := defaultNGPULayers
+        cfg.NGPULayers = &defaultHigh
+    }
+
+    log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")
+
     // template estimations
     if cfg.HasTemplate() {
         // nothing to guess here
         log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
@@ -166,12 +208,12 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
     log.Debug().
         Any("eosTokenID", f.Tokenizer().EOSTokenID).
         Any("bosTokenID", f.Tokenizer().BOSTokenID).
-        Any("modelName", f.Model().Name).
+        Any("modelName", f.Metadata().Name).
         Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
 
     // guess the name
     if cfg.Name == "" {
-        cfg.Name = f.Model().Name
+        cfg.Name = f.Metadata().Name
     }
 
     family := identifyFamily(f)
@@ -207,6 +249,7 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
         cfg.TemplateConfig.JinjaTemplate = true
         cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
     }
+}
 
 func identifyFamily(f *gguf.GGUFFile) familyType {
@@ -231,7 +274,7 @@ func identifyFamily(f *gguf.GGUFFile) familyType {
     commandR := arch == "command-r" && eosTokenID == 255001
     qwen2 := arch == "qwen2"
     phi3 := arch == "phi-3"
-    gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
+    gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
     deepseek2 := arch == "deepseek2"
 
     switch {
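The VRAM hunk above reads three fields off the xsysinfo estimate: IsFullOffload, EstimatedVRAM and EstimatedLayers. The pkg/xsysinfo side of the commit is not among the hunks shown, so the following is only a hypothetical sketch of the surface those call sites imply; the struct fields and the function signature come from the diff, while the BlockCount/Size lookups and the naive proportional split are assumptions:

package xsysinfo

import (
	"errors"

	gguf "github.com/gpustack/gguf-parser-go"
)

// VRAMEstimate mirrors the fields guessGGUFFromFile consumes.
type VRAMEstimate struct {
	IsFullOffload   bool   // the whole model fits in available VRAM
	EstimatedVRAM   uint64 // bytes the model is expected to need
	EstimatedLayers int    // how many layers fit in availableVRAM
}

func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
	layers := int(f.Architecture().BlockCount) // transformer blocks in the model
	if layers == 0 {
		return nil, errors.New("no block count in GGUF metadata")
	}

	// Rough proxy: split the tensor payload evenly across layers.
	total := uint64(f.Metadata().Size)
	perLayer := total / uint64(layers)

	fit := layers
	if perLayer > 0 && availableVRAM/perLayer < uint64(layers) {
		fit = int(availableVRAM / perLayer)
	}

	return &VRAMEstimate{
		IsFullOffload:   fit >= layers,
		EstimatedVRAM:   total,
		EstimatedLayers: fit,
	}, nil
}

Whatever the real estimator computes, the fallback in the hunk is explicit: when no layer estimate lands, NGPULayers is set to defaultNGPULayers (99999999), which llama.cpp treats as "offload everything", matching the in-diff comment "we assume we want to offload all layers".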

View file

@@ -4,9 +4,8 @@ import (
     "os"
     "path/filepath"
 
-    "github.com/mudler/LocalAI/pkg/xsysinfo"
+    gguf "github.com/gpustack/gguf-parser-go"
     "github.com/rs/zerolog/log"
-    gguf "github.com/thxcode/gguf-parser-go"
 )
 
 func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
@@ -36,10 +35,4 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
         }
         cfg.ContextSize = &defaultCtx
     }
-
-    if cfg.Options == nil {
-        if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
-            cfg.Options = []string{"gpu"}
-        }
-    }
 }
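TotalAvailableVRAM is likewise defined in pkg/xsysinfo rather than in these hunks (the GPU-options gate above simply moved from guesser.go into gguf.go). Below is one plausible shape for it, using NVIDIA's NVML bindings; this is an illustration only, not the commit's implementation, which, judging by the HasGPU("nvidia") || HasGPU("amd") check, presumably covers AMD as well:

package xsysinfo

import (
	"errors"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// TotalAvailableVRAM sums the VRAM of every visible NVIDIA GPU.
func TotalAvailableVRAM() (uint64, error) {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		return 0, errors.New(nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return 0, errors.New(nvml.ErrorString(ret))
	}

	var total uint64
	for i := 0; i < count; i++ {
		dev, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			continue // skip devices we cannot query
		}
		mem, ret := dev.GetMemoryInfo()
		if ret != nvml.SUCCESS {
			continue
		}
		total += mem.Total
	}
	return total, nil
}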