feat: queue up requests if not running parallel requests (#1296)

Return a gRPC client that acquires a lock around each call when the
backend is not meant to serve parallel requests, so concurrent requests
queue up instead of overlapping.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto authored on 2023-11-16 22:20:16 +01:00, committed by GitHub
parent 2addb9f99a
commit 548959b50f
5 changed files with 64 additions and 16 deletions
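
The mechanism behind the title: the per-model gRPC client serializes its
own calls. Below is a minimal sketch in Go of how such a client can queue
requests, assuming it guards every call with a sync.Mutex when parallel is
false; the opLock field and lockIfNeeded helper are invented names for
illustration, not necessarily the ones used in the LocalAI codebase:

	package grpc

	import "sync"

	// Client talks to one backend process. When parallel is false, opLock
	// is held for the duration of every call, so concurrent requests
	// queue up behind each other instead of reaching the backend at once.
	type Client struct {
		address  string
		parallel bool
		opLock   sync.Mutex
	}

	func NewClient(address string, parallel bool) *Client {
		return &Client{address: address, parallel: parallel}
	}

	// lockIfNeeded takes the lock only for non-parallel clients and
	// returns the matching unlock; parallel clients get a no-op.
	func (c *Client) lockIfNeeded() func() {
		if c.parallel {
			return func() {}
		}
		c.opLock.Lock()
		return c.opLock.Unlock
	}

Each RPC wrapper (HealthCheck and the like) would then begin with
defer c.lockIfNeeded()() before touching the backend.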


@@ -67,8 +67,8 @@ type ModelLoader struct {
 
 type ModelAddress string
 
-func (m ModelAddress) GRPC() *grpc.Client {
-	return grpc.NewClient(string(m))
+func (m ModelAddress) GRPC(parallel bool) *grpc.Client {
+	return grpc.NewClient(string(m), parallel)
 }
 
 func NewModelLoader(modelPath string) *ModelLoader {
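
With the new signature every call site must choose a concurrency mode
explicitly. A hypothetical call site (the address is invented for
illustration):

	addr := ModelAddress("127.0.0.1:50051")
	client := addr.GRPC(false)        // calls are queued behind the client's lock
	parallelClient := addr.GRPC(true) // calls may reach the backend concurrently
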
@@ -147,10 +147,16 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
 }
 
 func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
+	var client *grpc.Client
 	if m, ok := ml.models[s]; ok {
 		log.Debug().Msgf("Model already loaded in memory: %s", s)
+		if c, ok := ml.grpcClients[s]; ok {
+			client = c
+		} else {
+			client = m.GRPC(false)
+		}
 
-		if !m.GRPC().HealthCheck(context.Background()) {
+		if !client.HealthCheck(context.Background()) {
 			log.Debug().Msgf("GRPC Model not responding: %s", s)
 			if !ml.grpcProcesses[s].IsAlive() {
 				log.Debug().Msgf("GRPC Process is not responding: %s", s)