mirror of https://github.com/mudler/LocalAI.git (synced 2025-06-06 02:45:00 +00:00)
feat: improve RAM estimation by using values from summary
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
parent 0870bf5af6
commit 4909aa6750

1 changed file with 21 additions and 13 deletions
@@ -1,8 +1,6 @@
 package xsysinfo
 
 import (
-	"errors"
-
 	gguf "github.com/gpustack/gguf-parser-go"
 )
 
@@ -18,28 +16,38 @@ type VRAMEstimate struct {
 func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
 	// Get model metadata
 	m := f.Metadata()
-	a := f.Architecture()
+
+	estimate := f.EstimateLLaMACppRun()
 
+	lmes := estimate.SummarizeItem(true, 0, 0)
+	estimatedVRAM := uint64(0)
+	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
+
+	for _, vram := range lmes.VRAMs {
+		estimatedVRAM += uint64(vram.NonUMA)
+	}
+
 	// Calculate base model size
 	modelSize := uint64(m.Size)
 
-	if a.BlockCount == 0 {
-		return nil, errors.New("block count is 0")
+	if availableLayers == 0 {
+		availableLayers = 1
+	}
+
+	if estimatedVRAM == 0 {
+		estimatedVRAM = 1
 	}
 
 	// Estimate number of layers that can fit in VRAM
 	// Each layer typically requires about 1/32 of the model size
-	layerSize := modelSize / uint64(a.BlockCount)
-	estimatedLayers := int(availableVRAM / layerSize)
+	layerSize := estimatedVRAM / availableLayers
 
-	// If we can't fit even one layer, we need to do full offload
-	isFullOffload := estimatedLayers <= 0
-	if isFullOffload {
-		estimatedLayers = 0
+	estimatedLayers := int(availableVRAM / layerSize)
+
+	if availableVRAM > estimatedVRAM {
+		estimatedLayers = int(availableLayers)
 	}
 
 	// Calculate estimated VRAM usage
-	estimatedVRAM := uint64(estimatedLayers) * layerSize
 
 	return &VRAMEstimate{
 		TotalVRAM:       availableVRAM,
@@ -47,6 +55,6 @@ func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
 		ModelSize:       modelSize,
 		EstimatedLayers: estimatedLayers,
 		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   isFullOffload,
+		IsFullOffload:   availableVRAM > estimatedVRAM,
 	}, nil
 }
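To see what the new arithmetic does, here is a worked example (the numbers are illustrative, not taken from the commit): suppose the llama.cpp run summary reports 32 offloadable layers and the per-GPU NonUMA values sum to an estimatedVRAM of 8 GiB. Then layerSize = 8 GiB / 32 = 256 MiB, so with 4 GiB of available VRAM the function estimates 4 GiB / 256 MiB = 16 layers and IsFullOffload is false. With 12 GiB available, availableVRAM exceeds estimatedVRAM, so estimatedLayers is raised to all 32 layers and IsFullOffload is true. The two guards that force availableLayers and estimatedVRAM to at least 1 keep both divisions safe when the summary returns zeros.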
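A minimal sketch of how the revised estimator might be called, assuming LocalAI's module path github.com/mudler/LocalAI with this package under pkg/xsysinfo, and gguf-parser-go's ParseGGUFFile for loading the model; the model path and the 8 GiB VRAM figure are placeholders:

package main

import (
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"

	"github.com/mudler/LocalAI/pkg/xsysinfo"
)

func main() {
	// Parse a local GGUF file (hypothetical path; any GGUF model works).
	f, err := gguf.ParseGGUFFile("/models/llama-3-8b-q4_k_m.gguf")
	if err != nil {
		log.Fatal(err)
	}

	// Assume 8 GiB of usable VRAM on the target GPU.
	availableVRAM := uint64(8) << 30

	est, err := xsysinfo.EstimateGGUFVRAMUsage(f, availableVRAM)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("offload %d layers (full offload: %v), estimated VRAM: %.2f GiB\n",
		est.EstimatedLayers, est.IsFullOffload, float64(est.EstimatedVRAM)/(1<<30))
}

Note that, as far as this diff shows, the error return is now vestigial: the zero-value guards replace the old BlockCount check, so the function always returns a non-nil estimate.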