feat: queue up requests if not running parallel requests (#1296)

Return a gRPC client that guards its requests with a lock when it is not meant to serve requests in parallel. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 03:24:59 +00:00 · 2023-11-16 22:20:16 +01:00 · 2023-11-16 22:20:16 +01:00 · 548959b50f
commit 548959b50f
parent 2addb9f99a
5 changed files with 64 additions and 16 deletions
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@ -121,7 +121,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		// Wait for the service to start up
 		ready := false
 		for i := 0; i < o.grpcAttempts; i++ {
-			if client.GRPC().HealthCheck(context.Background()) {
+			if client.GRPC(o.parallelRequests).HealthCheck(context.Background()) {
 				log.Debug().Msgf("GRPC Service Ready")
 				ready = true
 				break
@ -140,7 +140,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

 		log.Debug().Msgf("GRPC: Loading model with options: %+v", options)

-		res, err := client.GRPC().LoadModel(o.context, &options)
+		res, err := client.GRPC(o.parallelRequests).LoadModel(o.context, &options)
 		if err != nil {
 			return "", fmt.Errorf("could not load model: %w", err)
 		}
@ -154,11 +154,11 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

 func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (*grpc.Client, error) {
 	if parallel {
-		return addr.GRPC(), nil
+		return addr.GRPC(parallel), nil
 	}

 	if _, ok := ml.grpcClients[string(addr)]; !ok {
-		ml.grpcClients[string(addr)] = addr.GRPC()
+		ml.grpcClients[string(addr)] = addr.GRPC(parallel)
 	}
 	return ml.grpcClients[string(addr)], nil
 }
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@ -67,8 +67,8 @@ type ModelLoader struct {

 type ModelAddress string

-func (m ModelAddress) GRPC() *grpc.Client {
-	return grpc.NewClient(string(m))
+func (m ModelAddress) GRPC(parallel bool) *grpc.Client {
+	return grpc.NewClient(string(m), parallel)
 }

 func NewModelLoader(modelPath string) *ModelLoader {
@ -147,10 +147,16 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
 }

 func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
+	var client *grpc.Client
 	if m, ok := ml.models[s]; ok {
 		log.Debug().Msgf("Model already loaded in memory: %s", s)
+		if c, ok := ml.grpcClients[s]; ok {
+			client = c
+		} else {
+			client = m.GRPC(false)
+		}

-		if !m.GRPC().HealthCheck(context.Background()) {
+		if !client.HealthCheck(context.Background()) {
 			log.Debug().Msgf("GRPC Model not responding: %s", s)
 			if !ml.grpcProcesses[s].IsAlive() {
 				log.Debug().Msgf("GRPC Process is not responding: %s", s)
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@ -17,7 +17,7 @@ import (
 func (ml *ModelLoader) StopAllExcept(s string) {
 	ml.StopGRPC(func(id string, p *process.Process) bool {
 		if id != s {
-			for ml.models[id].GRPC().IsBusy() {
+			for ml.models[id].GRPC(false).IsBusy() {
 				log.Debug().Msgf("%s busy. Waiting.", id)
 				time.Sleep(2 * time.Second)
 			}