feat: improve RAM estimation by using values from summary

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 09:55:00 +00:00 · 2025-05-29 22:42:58 +02:00 · 2025-05-29 22:42:58 +02:00 · 4909aa6750
commit 4909aa6750
parent 0870bf5af6
1 changed files with 21 additions and 13 deletions
--- a/pkg/xsysinfo/gguf.go
+++ b/pkg/xsysinfo/gguf.go
@ -1,8 +1,6 @@
 package xsysinfo

 import (
-	"errors"
-
 	gguf "github.com/gpustack/gguf-parser-go"
 )

@ -18,28 +16,38 @@ type VRAMEstimate struct {
 func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
 	// Get model metadata
 	m := f.Metadata()
-	a := f.Architecture()
+
+	estimate := f.EstimateLLaMACppRun()
+
+	lmes := estimate.SummarizeItem(true, 0, 0)
+	estimatedVRAM := uint64(0)
+	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
+
+	for _, vram := range lmes.VRAMs {
+		estimatedVRAM += uint64(vram.NonUMA)
+	}

 	// Calculate base model size
 	modelSize := uint64(m.Size)

-	if a.BlockCount == 0 {
-		return nil, errors.New("block count is 0")
+	if availableLayers == 0 {
+		availableLayers = 1
+	}
+
+	if estimatedVRAM == 0 {
+		estimatedVRAM = 1
 	}

 	// Estimate number of layers that can fit in VRAM
 	// Each layer typically requires about 1/32 of the model size
-	layerSize := modelSize / uint64(a.BlockCount)
-	estimatedLayers := int(availableVRAM / layerSize)
+	layerSize := estimatedVRAM / availableLayers

-	// If we can't fit even one layer, we need to do full offload
-	isFullOffload := estimatedLayers <= 0
-	if isFullOffload {
-		estimatedLayers = 0
+	estimatedLayers := int(availableVRAM / layerSize)
+	if availableVRAM > estimatedVRAM {
+		estimatedLayers = int(availableLayers)
 	}

 	// Calculate estimated VRAM usage
-	estimatedVRAM := uint64(estimatedLayers) * layerSize

 	return &VRAMEstimate{
 		TotalVRAM:       availableVRAM,
@ -47,6 +55,6 @@ func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimat
 		ModelSize:       modelSize,
 		EstimatedLayers: estimatedLayers,
 		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   isFullOffload,
+		IsFullOffload:   availableVRAM > estimatedVRAM,
 	}, nil
 }