Revert #1963 (#2056)

* Revert "fix(fncall): fix regression introduced in #1963 (#2048)"

  This reverts commit 6b06d4e0af.

* Revert "fix: action-tmate back to upstream, dead code removal (#2038)"

  This reverts commit fdec8a9d00.

* Revert "feat(grpc): return consumed token count and update response accordingly (#2035)"

  This reverts commit e843d7df0e.

* Revert "refactor: backend/service split, channel-based llm flow (#1963)"

  This reverts commit eed5706994.

* feat(grpc): return consumed token count and update response accordingly

  Fixes: #1920

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 10:35:01 +00:00 · 2024-04-17 23:33:49 +02:00 · 2024-04-17 23:33:49 +02:00 · af9e5a2d05
commit af9e5a2d05
parent af8c705ecd
52 changed files with 2295 additions and 3065 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -11,22 +11,17 @@ import (

 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"
-	"github.com/rs/zerolog/log"

-	"github.com/go-skynet/LocalAI/pkg/concurrency"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-type LLMRequest struct {
-	Id          int // TODO Remove if not used.
-	Text        string
-	Images      []string
-	RawMessages []schema.Message
-	// TODO: Other Modalities?
+type LLMResponse struct {
+	Response string // should this be []byte?
+	Usage    TokenUsage
 }

 type TokenUsage struct {
@@ -34,94 +29,57 @@ type TokenUsage struct {
 	Completion int
 }

-type LLMResponse struct {
-	Request  *LLMRequest
-	Response string // should this be []byte?
-	Usage    TokenUsage
-}
-
-// TODO: Does this belong here or in core/services/openai.go?
-type LLMResponseBundle struct {
-	Request  *schema.OpenAIRequest
-	Response []schema.Choice
-	Usage    TokenUsage
-}
-
-type LLMBackendService struct {
-	bcl        *config.BackendConfigLoader
-	ml         *model.ModelLoader
-	appConfig  *config.ApplicationConfig
-	ftMutex    sync.Mutex
-	cutstrings map[string]*regexp.Regexp
-}
-
-func NewLLMBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *LLMBackendService {
-	return &LLMBackendService{
-		bcl:        bcl,
-		ml:         ml,
-		appConfig:  appConfig,
-		ftMutex:    sync.Mutex{},
-		cutstrings: make(map[string]*regexp.Regexp),
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+	modelFile := c.Model
+	threads := c.Threads
+	if *threads == 0 && o.Threads != 0 {
+		threads = &o.Threads
 	}
-}
-
-// TODO: Should ctx param be removed and replaced with hardcoded req.Context?
-func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest, bc *config.BackendConfig, enableTokenChannel bool) (
-	resultChannel <-chan concurrency.ErrorOr[*LLMResponse], tokenChannel <-chan concurrency.ErrorOr[*LLMResponse], err error) {
-
-	threads := bc.Threads
-	if (threads == nil || *threads == 0) && llmbs.appConfig.Threads != 0 {
-		threads = &llmbs.appConfig.Threads
-	}
-
-	grpcOpts := gRPCModelOpts(bc)
+	grpcOpts := gRPCModelOpts(c)

 	var inferenceModel grpc.Backend
+	var err error

-	opts := modelOpts(bc, llmbs.appConfig, []model.Option{
+	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
-		model.WithAssetDir(llmbs.appConfig.AssetsDestination),
-		model.WithModel(bc.Model),
-		model.WithContext(llmbs.appConfig.Context),
+		model.WithAssetDir(o.AssetsDestination),
+		model.WithModel(modelFile),
+		model.WithContext(o.Context),
 	})

-	if bc.Backend != "" {
-		opts = append(opts, model.WithBackendString(bc.Backend))
+	if c.Backend != "" {
+		opts = append(opts, model.WithBackendString(c.Backend))
 	}

-	// Check if bc.Model exists, if it doesn't try to load it from the gallery
-	if llmbs.appConfig.AutoloadGalleries { // experimental
-		if _, err := os.Stat(bc.Model); os.IsNotExist(err) {
+	// Check if the modelFile exists, if it doesn't try to load it from the gallery
+	if o.AutoloadGalleries { // experimental
+		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
 			utils.ResetDownloadTimers()
 			// if we failed to load the model, we try to download it
-			err := gallery.InstallModelFromGalleryByName(llmbs.appConfig.Galleries, bc.Model, llmbs.appConfig.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
+			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
 			if err != nil {
-				return nil, nil, err
+				return nil, err
 			}
 		}
 	}

-	if bc.Backend == "" {
-		log.Debug().Msgf("backend not known for %q, falling back to greedy loader to find it", bc.Model)
-		inferenceModel, err = llmbs.ml.GreedyLoader(opts...)
+	if c.Backend == "" {
+		inferenceModel, err = loader.GreedyLoader(opts...)
 	} else {
-		inferenceModel, err = llmbs.ml.BackendLoader(opts...)
+		inferenceModel, err = loader.BackendLoader(opts...)
 	}

 	if err != nil {
-		log.Error().Err(err).Msg("[llmbs.Inference] failed to load a backend")
-		return
+		return nil, err
 	}

-	grpcPredOpts := gRPCPredictOpts(bc, llmbs.appConfig.ModelPath)
-	grpcPredOpts.Prompt = req.Text
-	grpcPredOpts.Images = req.Images
-
-	if bc.TemplateConfig.UseTokenizerTemplate && req.Text == "" {
-		grpcPredOpts.UseTokenizerTemplate = true
-		protoMessages := make([]*proto.Message, len(req.RawMessages), len(req.RawMessages))
-		for i, message := range req.RawMessages {
+	var protoMessages []*proto.Message
+	// if we are using the tokenizer template, we need to convert the messages to proto messages
+	// unless the prompt has already been tokenized (non-chat endpoints + functions)
+	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
+		protoMessages = make([]*proto.Message, len(messages), len(messages))
+		for i, message := range messages {
 			protoMessages[i] = &proto.Message{
 				Role: message.Role,
 			}
@@ -129,32 +87,47 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
 			case string:
 				protoMessages[i].Content = ct
 			default:
-				err = fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
-				return
+				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
 			}
 		}
 	}

-	tokenUsage := TokenUsage{}
+	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
+	fn := func() (LLMResponse, error) {
+		opts := gRPCPredictOpts(c, loader.ModelPath)
+		opts.Prompt = s
+		opts.Messages = protoMessages
+		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
+		opts.Images = images

-	promptInfo, pErr := inferenceModel.TokenizeString(ctx, grpcPredOpts)
-	if pErr == nil && promptInfo.Length > 0 {
-		tokenUsage.Prompt = int(promptInfo.Length)
-	}
+		tokenUsage := TokenUsage{}

-	rawResultChannel := make(chan concurrency.ErrorOr[*LLMResponse])
-	// TODO this next line is the biggest argument for taking named return values _back_ out!!!
-	var rawTokenChannel chan concurrency.ErrorOr[*LLMResponse]
+		// check the per-model feature flag for usage, since tokenCallback may have a cost.
+		// Defaults to off as for now it is still experimental
+		if c.FeatureFlag.Enabled("usage") {
+			userTokenCallback := tokenCallback
+			if userTokenCallback == nil {
+				userTokenCallback = func(token string, usage TokenUsage) bool {
+					return true
+				}
+			}

-	if enableTokenChannel {
-		rawTokenChannel = make(chan concurrency.ErrorOr[*LLMResponse])
+			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
+			if pErr == nil && promptInfo.Length > 0 {
+				tokenUsage.Prompt = int(promptInfo.Length)
+			}

-		// TODO Needs better name
-		ss := ""
+			tokenCallback = func(token string, usage TokenUsage) bool {
+				tokenUsage.Completion++
+				return userTokenCallback(token, tokenUsage)
+			}
+		}
+
+		if tokenCallback != nil {
+			ss := ""

-		go func() {
 			var partialRune []byte
-			err := inferenceModel.PredictStream(ctx, grpcPredOpts, func(chars []byte) {
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
 				partialRune = append(partialRune, chars...)

 				for len(partialRune) > 0 {
@@ -164,126 +137,54 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
 						break
 					}

-					tokenUsage.Completion++
-					rawTokenChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
-						Response: string(r),
-						Usage:    tokenUsage,
-					}}
-
+					tokenCallback(string(r), tokenUsage)
 					ss += string(r)

 					partialRune = partialRune[size:]
 				}
 			})
-			close(rawTokenChannel)
+			return LLMResponse{
+				Response: ss,
+				Usage:    tokenUsage,
+			}, err
+		} else {
+			// TODO: Is the chicken bit the only way to get here? is that acceptable?
+			reply, err := inferenceModel.Predict(ctx, opts)
 			if err != nil {
-				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
-			} else {
-				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
-					Response: ss,
-					Usage:    tokenUsage,
-				}}
+				return LLMResponse{}, err
 			}
-			close(rawResultChannel)
-		}()
-	} else {
-		go func() {
-			reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
 			if tokenUsage.Prompt == 0 {
 				tokenUsage.Prompt = int(reply.PromptTokens)
 			}
 			if tokenUsage.Completion == 0 {
 				tokenUsage.Completion = int(reply.Tokens)
 			}
-			if err != nil {
-				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
-				close(rawResultChannel)
-			} else {
-				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
-					Response: string(reply.Message),
-					Usage:    tokenUsage,
-				}}
-				close(rawResultChannel)
-			}
-		}()
+			return LLMResponse{
+				Response: string(reply.Message),
+				Usage:    tokenUsage,
+			}, err
+		}
 	}

-	resultChannel = rawResultChannel
-	tokenChannel = rawTokenChannel
-	return
+	return fn, nil
 }

-// TODO: Should predInput be a seperate param still, or should this fn handle extracting it from request??
-func (llmbs *LLMBackendService) GenerateText(predInput string, request *schema.OpenAIRequest, bc *config.BackendConfig,
-	mappingFn func(*LLMResponse) schema.Choice, enableCompletionChannels bool, enableTokenChannels bool) (
-	// Returns:
-	resultChannel <-chan concurrency.ErrorOr[*LLMResponseBundle], completionChannels []<-chan concurrency.ErrorOr[*LLMResponse], tokenChannels []<-chan concurrency.ErrorOr[*LLMResponse], err error) {
+var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
+var mu sync.Mutex = sync.Mutex{}

-	rawChannel := make(chan concurrency.ErrorOr[*LLMResponseBundle])
-	resultChannel = rawChannel
-
-	if request.N == 0 { // number of completions to return
-		request.N = 1
-	}
-	images := []string{}
-	for _, m := range request.Messages {
-		images = append(images, m.StringImages...)
-	}
-
-	for i := 0; i < request.N; i++ {
-
-		individualResultChannel, tokenChannel, infErr := llmbs.Inference(request.Context, &LLMRequest{
-			Text:        predInput,
-			Images:      images,
-			RawMessages: request.Messages,
-		}, bc, enableTokenChannels)
-		if infErr != nil {
-			err = infErr // Avoids complaints about redeclaring err but looks dumb
-			return
-		}
-		completionChannels = append(completionChannels, individualResultChannel)
-		tokenChannels = append(tokenChannels, tokenChannel)
-	}
-
-	go func() {
-		initialBundle := LLMResponseBundle{
-			Request:  request,
-			Response: []schema.Choice{},
-			Usage:    TokenUsage{},
-		}
-
-		wg := concurrency.SliceOfChannelsReducer(completionChannels, rawChannel, func(iv concurrency.ErrorOr[*LLMResponse], ov concurrency.ErrorOr[*LLMResponseBundle]) concurrency.ErrorOr[*LLMResponseBundle] {
-			if iv.Error != nil {
-				ov.Error = iv.Error
-				// TODO: Decide if we should wipe partials or not?
-				return ov
-			}
-			ov.Value.Usage.Prompt += iv.Value.Usage.Prompt
-			ov.Value.Usage.Completion += iv.Value.Usage.Completion
-
-			ov.Value.Response = append(ov.Value.Response, mappingFn(iv.Value))
-			return ov
-		}, concurrency.ErrorOr[*LLMResponseBundle]{Value: &initialBundle}, true)
-		wg.Wait()
-
-	}()
-
-	return
-}
-
-func (llmbs *LLMBackendService) Finetune(config config.BackendConfig, input, prediction string) string {
+func Finetune(config config.BackendConfig, input, prediction string) string {
 	if config.Echo {
 		prediction = input + prediction
 	}

 	for _, c := range config.Cutstrings {
-		llmbs.ftMutex.Lock()
-		reg, ok := llmbs.cutstrings[c]
+		mu.Lock()
+		reg, ok := cutstrings[c]
 		if !ok {
-			llmbs.cutstrings[c] = regexp.MustCompile(c)
-			reg = llmbs.cutstrings[c]
+			cutstrings[c] = regexp.MustCompile(c)
+			reg = cutstrings[c]
 		}
-		llmbs.ftMutex.Unlock()
+		mu.Unlock()
 		prediction = reg.ReplaceAllString(prediction, "")
 	}