// LocalAI/pkg/xsysinfo/gguf.go

package xsysinfo

import (
	gguf "github.com/gpustack/gguf-parser-go"
)

// VRAMEstimate is the result of EstimateGGUFVRAMUsage: it pairs the VRAM a
// GGUF model is estimated to need with the VRAM the caller reported as
// available, and says how much of the model is expected to fit.
//
// Sizes are in whatever unit the GGUF parser reports (presumably bytes —
// confirm against gguf-parser-go).
type VRAMEstimate struct {
	// TotalVRAM is populated with the caller-supplied available VRAM; the
	// estimator has no separate source for the device total.
	TotalVRAM       uint64
	// AvailableVRAM is the VRAM budget passed in by the caller.
	AvailableVRAM   uint64
	// ModelSize is the model size taken from the GGUF metadata.
	ModelSize       uint64
	// EstimatedLayers is the number of layers estimated to fit in AvailableVRAM.
	EstimatedLayers int
	// EstimatedVRAM is the VRAM the model is estimated to need (at least 1).
	EstimatedVRAM   uint64
	// IsFullOffload reports whether AvailableVRAM is strictly greater than
	// EstimatedVRAM.
	IsFullOffload   bool
}

// EstimateGGUFVRAMUsage estimates how much VRAM running the model in f needs
// and how many of its offloadable layers fit into availableVRAM.
//
// The VRAM requirement is the sum of the NonUMA figures of every VRAM entry in
// the llama.cpp run estimate produced by the GGUF parser. The per-layer cost is
// approximated by spreading that requirement evenly across the offloadable
// layers reported by the same estimate.
//
// f must be non-nil. availableVRAM is expected to be in the same unit as the
// parser's estimate (presumably bytes — confirm against gguf-parser-go).
// The returned error is currently always nil.
func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
	// Model size as recorded in the GGUF metadata.
	m := f.Metadata()
	modelSize := uint64(m.Size)

	estimate := f.EstimateLLaMACppRun()
	summary := estimate.SummarizeItem(true, 0, 0)

	// Total VRAM requirement across all VRAM entries of the estimate.
	var estimatedVRAM uint64
	for _, vram := range summary.VRAMs {
		estimatedVRAM += uint64(vram.NonUMA)
	}

	// Never report or divide by zero: treat an empty estimate as one layer
	// needing one unit of VRAM.
	offloadLayers := summary.OffloadLayers
	if offloadLayers == 0 {
		offloadLayers = 1
	}
	if estimatedVRAM == 0 {
		estimatedVRAM = 1
	}

	// Full offload requires strictly more VRAM than the estimate.
	isFullOffload := availableVRAM > estimatedVRAM

	fittingLayers := offloadLayers
	if !isFullOffload {
		// Integer division truncates, so the per-layer size can come out as 0
		// when the estimate is smaller than the layer count; clamp it to 1 to
		// avoid a divide-by-zero panic below.
		layerSize := estimatedVRAM / offloadLayers
		if layerSize == 0 {
			layerSize = 1
		}

		// Because layerSize is rounded down, the quotient may overshoot the
		// real layer count; never report more layers than can be offloaded.
		fittingLayers = availableVRAM / layerSize
		if fittingLayers > offloadLayers {
			fittingLayers = offloadLayers
		}
	}

	return &VRAMEstimate{
		// The device total is not known here, so the available budget is
		// reported for both fields.
		TotalVRAM:       availableVRAM,
		AvailableVRAM:   availableVRAM,
		ModelSize:       modelSize,
		EstimatedLayers: int(fittingLayers),
		EstimatedVRAM:   estimatedVRAM,
		IsFullOffload:   isFullOffload,
	}, nil
}