feat(loader): enhance single active backend by treating as singleton (#5107)

feat(loader): enhance single active backend by treating at singleton

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2025-04-01 20:58:11 +02:00 committed by GitHub
parent c59975ab05
commit 2c425e9c69
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 92 additions and 71 deletions

View file

@ -16,7 +16,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
}

View file

@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}()
}
if options.LoadToMemory != nil {
if options.LoadToMemory != nil && !options.SingleBackend {
for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
if err != nil {

View file

@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
if err != nil {
return nil, err
}
defer loader.Close()
var fn func() ([]float32, error)
switch model := inferenceModel.(type) {

View file

@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if err != nil {
return nil, err
}
defer loader.Close()
fn := func() error {
_, err := inferenceModel.GenerateImage(

View file

@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if err != nil {
return nil, err
}
defer loader.Close()
var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages

View file

@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.SingleBackend {
defOpts = append(defOpts, model.WithSingleActiveBackend())
}
if so.ParallelBackendRequests {
defOpts = append(defOpts, model.EnableParallelRequests)
}
@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word,
Word: t.Word,
})
}
@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
DisableLogStatus: c.DisableLogStatus,
DType: c.DType,
// LimitMMPerPrompt vLLM
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,

View file

@ -12,10 +12,10 @@ import (
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...)
if err != nil {
return nil, err
}
defer loader.Close()
if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model")

View file

@ -26,10 +26,10 @@ func SoundGeneration(
opts := ModelOptions(backendConfig, appConfig)
soundGenModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model")

View file

@ -20,6 +20,7 @@ func TokenMetrics(
if err != nil {
return nil, err
}
defer loader.Close()
if model == nil {
return nil, fmt.Errorf("could not loadmodel model")

View file

@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...)
if err != nil {
return schema.TokenizeResponse{}, err
}
defer loader.Close()
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s

View file

@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
if err != nil {
return nil, err
}
defer ml.Close()
if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model")

View file

@ -23,10 +23,10 @@ func ModelTTS(
) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)

View file

@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
if err != nil {
return nil, err
}
defer ml.Close()
req := proto.VADRequest{
Audio: request.Audio,
}

View file

@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()

View file

@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
}
cl := config.NewBackendConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
return err
}

View file

@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()

View file

@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
vals := make([][]byte, len(input.Values))
for i, v := range input.Values {
@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
if err != nil {
return err
}
defer sl.Close()
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
return err
@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
if err != nil {
@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
if err != nil {
return err
}
defer sl.Close()
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
if err != nil {

View file

@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
cl := &config.BackendConfigLoader{}
//configsDir := "/tmp/localai/configs"
modelPath := "/tmp/localai/model"
var ml = model.NewModelLoader(modelPath)
var ml = model.NewModelLoader(modelPath, false)
appConfig := &config.ApplicationConfig{
ConfigsDir: configsDir,

View file

@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
router.Post("/v1/vad", vadChain...)
// Stores
sl := model.NewModelLoader("")
router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
if !appConfig.DisableMetrics {
router.Get("/metrics", localai.LocalAIMetricsEndpoint())