fix: race during stop of active backends (#5106)

* chore: drop double call to stop all backends, refactors

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: do lock when cycling to models to delete

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2025-04-01 00:01:10 +02:00 committed by GitHub
parent 2f9203cd2a
commit 05f7004487
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 45 additions and 37 deletions

3
.env
View file

@ -29,6 +29,9 @@
## Enable/Disable single backend (useful if only one GPU is available) ## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true # LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas. ## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit. ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM. ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.

View file

@ -473,8 +473,6 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
backend = realBackend backend = realBackend
} }
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
var backendToConsume string var backendToConsume string
switch backend { switch backend {
@ -497,13 +495,17 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
} }
func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) { func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
if !singleActiveBackend {
return
}
// If we can have only one backend active, kill all the others (except external backends) // If we can have only one backend active, kill all the others (except external backends)
if singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", modelID) // Stop all backends except the one we are going to load
err := ml.StopGRPC(allExcept(modelID)) log.Debug().Msgf("Stopping all backends except '%s'", modelID)
if err != nil { err := ml.StopGRPC(allExcept(modelID))
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing") if err != nil {
} log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
} }
} }
@ -520,10 +522,12 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
ml.stopActiveBackends(o.modelID, o.singleActiveBackend) ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
// if a backend is defined, return the loader directly
if o.backendString != "" { if o.backendString != "" {
return ml.backendLoader(opts...) return ml.backendLoader(opts...)
} }
// Otherwise scan for backends in the asset directory
var err error var err error
// get backends embedded in the binary // get backends embedded in the binary

View file

@ -142,26 +142,6 @@ func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string,
func (ml *ModelLoader) ShutdownModel(modelName string) error { func (ml *ModelLoader) ShutdownModel(modelName string) error {
ml.mu.Lock() ml.mu.Lock()
defer ml.mu.Unlock() defer ml.mu.Unlock()
model, ok := ml.models[modelName]
if !ok {
return fmt.Errorf("model %s not found", modelName)
}
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", modelName)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
break
}
}
return ml.deleteProcess(modelName) return ml.deleteProcess(modelName)
} }

View file

@ -9,25 +9,43 @@ import (
"strconv" "strconv"
"strings" "strings"
"syscall" "syscall"
"time"
"github.com/hpcloud/tail" "github.com/hpcloud/tail"
process "github.com/mudler/go-processmanager" process "github.com/mudler/go-processmanager"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
) )
var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
func (ml *ModelLoader) deleteProcess(s string) error { func (ml *ModelLoader) deleteProcess(s string) error {
model, ok := ml.models[s]
if !ok {
log.Debug().Msgf("Model %s not found", s)
return fmt.Errorf("model %s not found", s)
}
defer delete(ml.models, s) defer delete(ml.models, s)
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", s)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && forceBackendShutdown {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", s, retries)
break
}
}
log.Debug().Msgf("Deleting process %s", s) log.Debug().Msgf("Deleting process %s", s)
m, exists := ml.models[s] process := model.Process()
if !exists {
log.Error().Msgf("Model does not exist %s", s)
// Nothing to do
return nil
}
process := m.Process()
if process == nil { if process == nil {
log.Error().Msgf("No process for %s", s) log.Error().Msgf("No process for %s", s)
// Nothing to do as there is no process // Nothing to do as there is no process
@ -44,9 +62,12 @@ func (ml *ModelLoader) deleteProcess(s string) error {
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error { func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
var err error = nil var err error = nil
ml.mu.Lock()
defer ml.mu.Unlock()
for k, m := range ml.models { for k, m := range ml.models {
if filter(k, m.Process()) { if filter(k, m.Process()) {
e := ml.ShutdownModel(k) e := ml.deleteProcess(k)
err = errors.Join(err, e) err = errors.Join(err, e)
} }
} }